refactor

updated reqs from clustering
2025-08-11 02:37:21 +01:00 · 2025-08-11 02:22:59 +01:00
11 changed files with 1120 additions and 233 deletions
--- a/apps/cluster_map/cluster.py
+++ b/apps/cluster_map/cluster.py
@@ -0,0 +1,12 @@
 """
 Discord Chat Embeddings Visualizer - Legacy Entry Point
 This file serves as a compatibility layer for the original cluster.py.
 The application has been refactored into modular components for better maintainability.
 """
 # Pull in the entry point of the refactored application so that launching
 # this legacy file still starts the app.
 from main import main
 # Run the app only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/apps/cluster_map/clustering.py
+++ b/apps/cluster_map/clustering.py
@@ -0,0 +1,99 @@
 """
 Clustering algorithms and evaluation metrics.
 """
 import numpy as np
 import streamlit as st
 from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS
 from sklearn.mixture import GaussianMixture
 from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import silhouette_score, calinski_harabasz_score
 import hdbscan
 from config import DEFAULT_RANDOM_STATE
 def apply_clustering(embeddings, clustering_method="None", n_clusters=5):
    """
    Cluster embeddings with the selected algorithm and score the result.

    Embeddings are standardized before clustering. Points labelled -1 (noise,
    as produced by HDBSCAN/OPTICS) are excluded when the metrics are computed.

    Args:
        embeddings: High-dimensional embeddings to cluster
        clustering_method: Name of clustering algorithm; "None" or an
            unrecognized name disables clustering
        n_clusters: Number of clusters (for methods that require it); also
            the minimum number of points that must be exceeded for any
            clustering to run
    Returns:
        tuple: (cluster_labels, silhouette_score, calinski_harabasz_score).
        All three are None when clustering is skipped or fails. The two
        scores are None when fewer than two non-noise clusters were found or
        when scoring itself fails (the labels are still returned then).
    """
    if clustering_method == "None" or len(embeddings) <= n_clusters:
        return None, None, None
    # Standardize embeddings for better clustering
    scaled_embeddings = StandardScaler().fit_transform(embeddings)
    n_points = len(embeddings)
    # Each entry builds its estimator lazily so only the selected one is constructed
    estimator_factories = {
        "HDBSCAN": lambda: hdbscan.HDBSCAN(
            min_cluster_size=max(2, n_points // 20),  # Adaptive min cluster size
            min_samples=1, cluster_selection_epsilon=0.5),
        "Spectral Clustering": lambda: SpectralClustering(
            n_clusters=n_clusters, random_state=DEFAULT_RANDOM_STATE,
            affinity='rbf', gamma=1.0),
        "Gaussian Mixture": lambda: GaussianMixture(
            n_components=n_clusters, random_state=DEFAULT_RANDOM_STATE,
            covariance_type='full', max_iter=200),
        "Agglomerative (Ward)": lambda: AgglomerativeClustering(
            n_clusters=n_clusters, linkage='ward'),
        "Agglomerative (Complete)": lambda: AgglomerativeClustering(
            n_clusters=n_clusters, linkage='complete'),
        "OPTICS": lambda: OPTICS(
            min_samples=max(2, n_points // 50), xi=0.05, min_cluster_size=0.1),
    }
    make_estimator = estimator_factories.get(clustering_method)
    if make_estimator is None:
        # Unrecognized method name: nothing to cluster with
        return None, None, None
    try:
        cluster_labels = make_estimator().fit_predict(scaled_embeddings)
    except Exception as e:
        st.warning(f"Clustering failed: {str(e)}")
        return None, None, None
    silhouette_avg = None
    calinski_harabasz = None
    # Remove noise points (label -1) before scoring; both metrics need at
    # least two distinct clusters among the remaining points.
    clustered = cluster_labels != -1
    valid_labels = cluster_labels[clustered]
    if len(np.unique(valid_labels)) > 1:
        valid_embeddings = scaled_embeddings[clustered]
        # Scoring is guarded separately so that a metric failure does not
        # throw away labels that were computed successfully.
        try:
            silhouette_value = silhouette_score(valid_embeddings, valid_labels)
            calinski_value = calinski_harabasz_score(valid_embeddings, valid_labels)
        except Exception as e:
            st.warning(f"Could not compute clustering metrics: {str(e)}")
        else:
            # Assign both together so the pair is never half-filled
            silhouette_avg = silhouette_value
            calinski_harabasz = calinski_value
    return cluster_labels, silhouette_avg, calinski_harabasz
 def get_cluster_statistics(cluster_labels):
    """
    Get basic statistics about clustering results.

    Args:
        cluster_labels: Array-like of integer cluster labels where -1 marks
            noise points, or None when no clustering was performed
    Returns:
        dict: Empty when cluster_labels is None. Otherwise it contains
            "n_clusters" (number of distinct labels other than -1),
            "n_noise_points" (number of -1 labels),
            "cluster_distribution" (np.bincount of the non-noise labels, or
            an empty list when there are no clusters) and
            "unique_clusters" (sorted distinct labels, -1 included).
    """
    if cluster_labels is None:
        return {}
    # Boolean masking below needs an ndarray; this also accepts plain lists
    labels = np.asarray(cluster_labels)
    is_noise = labels == -1
    unique_clusters = np.unique(labels)
    n_clusters = len(unique_clusters[unique_clusters != -1])  # Exclude noise cluster (-1)
    return {
        "n_clusters": n_clusters,
        "n_noise_points": np.sum(is_noise),
        "cluster_distribution": np.bincount(labels[~is_noise]) if n_clusters > 0 else [],
        "unique_clusters": unique_clusters
    }
--- a/apps/cluster_map/config.py
+++ b/apps/cluster_map/config.py
@@ -0,0 +1,73 @@
 """
 Configuration settings and constants for the Discord Chat Embeddings Visualizer.
 """
 # Application settings
 # These three values are passed to st.set_page_config() by
 # ui_components.setup_page_config().
 APP_TITLE = "Discord Chat Embeddings Visualizer"
 APP_ICON = "🗨️"
 APP_LAYOUT = "wide"
 # File paths
 # Directory scanned for *.csv chat logs by data_loader.load_all_chat_data().
 # NOTE(review): this is a relative path, so it is resolved against the
 # process working directory rather than this file's location - confirm the
 # app is always launched from apps/cluster_map.
 CHAT_LOGS_PATH = "../../discord_chat_logs"
 # Algorithm parameters
 # Seed passed as random_state to the scikit-learn / UMAP estimators.
 DEFAULT_RANDOM_STATE = 42
 # NOTE(review): presumably the default output dimensionality and cluster
 # count offered by the UI - usage is not visible here, verify against callers.
 DEFAULT_N_COMPONENTS = 2
 DEFAULT_N_CLUSTERS = 5
 # Visualization settings
 # NOTE(review): presumably defaults for the plot controls and the truncation
 # lengths for hover text / table cells - verify against the UI and
 # visualization modules.
 DEFAULT_POINT_SIZE = 8
 DEFAULT_POINT_OPACITY = 0.7
 MAX_DISPLAYED_AUTHORS = 10
 MESSAGE_CONTENT_PREVIEW_LENGTH = 200
 MESSAGE_CONTENT_DISPLAY_LENGTH = 100
 # Performance thresholds
 # NOTE(review): presumably a message count above which the UI warns about
 # slow methods - confirm in ui_components.display_performance_warnings.
 LARGE_DATASET_WARNING_THRESHOLD = 1000
 # Color palettes
 # Ten hex colors.
 PRIMARY_COLORS = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", 
                  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
 # Clustering method categories
 # Names must match the clustering_method strings that
 # clustering.apply_clustering() compares against.
 CLUSTERING_METHODS_REQUIRING_N_CLUSTERS = [
    "Spectral Clustering", 
    "Gaussian Mixture", 
    "Agglomerative (Ward)", 
    "Agglomerative (Complete)"
 ]
 # Methods flagged as expensive, grouped by pipeline stage.
 COMPUTATIONALLY_INTENSIVE_METHODS = {
    "dimension_reduction": ["t-SNE", "Spectral Embedding"],
    "clustering": ["Spectral Clustering", "OPTICS"]
 }
 # Method explanations
 # Short human-readable descriptions keyed by category, then by the display
 # name of the method / option / metric.
 METHOD_EXPLANATIONS = {
    "dimension_reduction": {
        "PCA": "Linear, fast, preserves global variance",
        "t-SNE": "Non-linear, good for local structure, slower",
        "UMAP": "Balanced speed/quality, preserves local & global structure",
        "Spectral Embedding": "Uses graph theory, good for non-convex clusters",
        "Force-Directed": "Physics-based layout, creates natural spacing"
    },
    "clustering": {
        "HDBSCAN": "Density-based, finds variable density clusters, handles noise",
        "Spectral Clustering": "Uses eigenvalues, good for non-convex shapes",
        "Gaussian Mixture": "Probabilistic, assumes gaussian distributions",
        "Agglomerative (Ward)": "Hierarchical, minimizes within-cluster variance",
        "Agglomerative (Complete)": "Hierarchical, minimizes maximum distance",
        "OPTICS": "Density-based, finds clusters of varying densities"
    },
    "separation": {
        "Spread Factor": "Applies repulsive forces between nearby points",
        "Smart Jittering": "Adds intelligent noise to separate overlapping points",
        "Density-Based Jittering": "Stronger separation in crowded areas",
        "Perplexity Factor": "Controls t-SNE's focus on local vs global structure",
        "Min Distance Factor": "Controls UMAP's point packing tightness"
    },
    "metrics": {
        "Silhouette Score": "Higher is better (range: -1 to 1)",
        "Calinski-Harabasz": "Higher is better, measures cluster separation"
    }
 }
--- a/apps/cluster_map/data_loader.py
+++ b/apps/cluster_map/data_loader.py
@@ -0,0 +1,86 @@
 """
 Data loading and parsing utilities for Discord chat logs.
 """
 import pandas as pd
 import numpy as np
 import streamlit as st
 import ast
 from pathlib import Path
 from config import CHAT_LOGS_PATH
@st.cache_data
 def load_all_chat_data():
    """Load all CSV files from the discord_chat_logs folder"""
    chat_logs_path = Path(CHAT_LOGS_PATH)
    with st.expander("📁 Loading Details", expanded=False):
        # Display the path for debugging
        st.write(f"Looking for CSV files in: {chat_logs_path}")
        st.write(f"Path exists: {chat_logs_path.exists()}")
        all_data = []
        for csv_file in chat_logs_path.glob("*.csv"):
            try:
                df = pd.read_csv(csv_file)
                df['source_file'] = csv_file.stem  # Add source file name
                all_data.append(df)
                st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}")
            except Exception as e:
                st.error(f"❌ Error loading {csv_file.name}: {e}")
        if all_data:
            combined_df = pd.concat(all_data, ignore_index=True)
            st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files")
        else:
            st.error("No data loaded!")
            combined_df = pd.DataFrame()
    return combined_df if all_data else pd.DataFrame()
@st.cache_data
 def parse_embeddings(df):
    """Parse the content_embedding column from string to numpy array"""
    embeddings = []
    valid_indices = []
    for idx, embedding_str in enumerate(df['content_embedding']):
        try:
            # Parse the string representation of the list
            embedding = ast.literal_eval(embedding_str)
            if isinstance(embedding, list) and len(embedding) > 0:
                embeddings.append(embedding)
                valid_indices.append(idx)
        except Exception as e:
            continue
    embeddings_array = np.array(embeddings)
    valid_df = df.iloc[valid_indices].copy()
    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
    st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")
    return embeddings_array, valid_df
 def filter_data(df, selected_sources, selected_authors):
    """
    Return the rows of df whose source file and author are both selected.

    An empty source selection is treated as "all sources"; an empty author
    selection matches no rows.
    """
    sources = selected_sources if selected_sources else df['source_file'].unique()
    source_match = df['source_file'].isin(sources)
    author_match = df['author_name'].isin(selected_authors)
    return df[source_match & author_match]
 def get_filtered_embeddings(embeddings, valid_df, filtered_df):
    """
    Get embeddings corresponding to filtered dataframe.

    Args:
        embeddings: Array whose row i belongs to row i of valid_df
        valid_df: DataFrame the embeddings are positionally aligned with
        filtered_df: Subset of valid_df (matched by index label)
    Returns:
        The rows of embeddings whose valid_df index label also appears in
        filtered_df's index, in valid_df order.
    """
    # A vectorized membership mask replaces a per-row scan of a Python list
    keep = valid_df.index.isin(filtered_df.index)
    return embeddings[keep]
--- a/apps/cluster_map/dimensionality_reduction.py
+++ b/apps/cluster_map/dimensionality_reduction.py
@@ -0,0 +1,211 @@
 """
 Dimensionality reduction algorithms and point separation techniques.
 """
 import numpy as np
 import streamlit as st
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE, SpectralEmbedding
 from sklearn.preprocessing import StandardScaler
 from sklearn.neighbors import NearestNeighbors
 from scipy.spatial.distance import pdist, squareform
 from scipy.optimize import minimize
 import umap
 from config import DEFAULT_RANDOM_STATE
 def apply_adaptive_spreading(embeddings, spread_factor=1.0):
    """
    Apply adaptive spreading to push apart nearby points while preserving global structure.
    Uses a force-based approach where closer points repel more strongly.

    Args:
        embeddings: 2-D float array of point coordinates (one row per point)
        spread_factor: Strength of the repulsion; values <= 0 disable it
    Returns:
        The input itself when spread_factor <= 0; otherwise a copy. The copy
        is returned unchanged for fewer than 2 or more than 1000 points, and
        with the repulsion steps applied in every other case.
    """
    if spread_factor <= 0:
        return embeddings
    embeddings = embeddings.copy()
    n_points = len(embeddings)
    print(f"DEBUG: Applying adaptive spreading to {n_points} points with factor {spread_factor}")
    if n_points < 2:
        return embeddings
    # For very large datasets, skip spreading to avoid hanging
    if n_points > 1000:
        print(f"DEBUG: Large dataset ({n_points} points), skipping adaptive spreading...")
        return embeddings
    # Apply force-based spreading with fewer iterations for large datasets
    max_iterations = 3 if n_points > 500 else 5
    for iteration in range(max_iterations):
        if iteration % 2 == 0:  # Progress indicator
            print(f"DEBUG: Spreading iteration {iteration + 1}/{max_iterations}")
        # Pairwise distances for the current positions (points move each step)
        distances = squareform(pdist(embeddings))
        # Per-pair coefficient: force magnitude spread_factor / (d^2 + 0.01)
        # divided by d to normalize the direction vector. Coincident points
        # (d == 0) have no defined direction and exert no force.
        strength = np.zeros_like(distances)
        apart = distances > 0
        strength[apart] = spread_factor / ((distances[apart] ** 2 + 0.01) * distances[apart])
        # forces[i] = sum_j strength[i, j] * (p_i - p_j), written in matrix
        # form so no (n, n, dims) intermediate is allocated.
        forces = strength.sum(axis=1, keepdims=True) * embeddings - strength @ embeddings
        # Apply forces with damping
        embeddings += forces * 0.1
    print(f"DEBUG: Adaptive spreading complete")
    return embeddings
 def force_directed_layout(high_dim_embeddings, n_components=2, spread_factor=1.0):
    """
    Lay out high-dimensional embeddings as PCA followed by adaptive spreading.

    Inputs above 500 points and smaller inputs go through the same
    computation; only the progress messages printed differ.
    """
    n_points = len(high_dim_embeddings)
    print(f"DEBUG: Starting force-directed layout with {n_points} points...")
    is_large = n_points > 500
    if is_large:
        # Announce the fallback before the projection is computed
        print(f"DEBUG: Large dataset ({n_points} points), using PCA + spreading instead...")
    projector = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
    layout = projector.fit_transform(high_dim_embeddings)
    if is_large:
        return apply_adaptive_spreading(layout, spread_factor)
    print(f"DEBUG: Initial PCA layout computed...")
    # Spreading the PCA result stands in for a full force-directed
    # optimization, which was too computationally intensive.
    spread_layout = apply_adaptive_spreading(layout, spread_factor)
    print(f"DEBUG: Force-directed layout complete...")
    return spread_layout
 def calculate_local_density_scaling(embeddings, k=5):
    """
    Calculate local density scaling factors to emphasize differences in dense regions.

    Args:
        embeddings: 2-D array of points
        k: Number of nearest neighbors used to estimate density
    Returns:
        1-D array with one value per point, min-max normalized to roughly
        [0, 1] (higher means denser). All ones when there are too few points
        to find k neighbors besides the point itself.
    """
    # k + 1 neighbors are queried below (each point is its own nearest
    # neighbor), so at least k + 1 points are required.
    if len(embeddings) <= k:
        return np.ones(len(embeddings))
    # Find k nearest neighbors for each point
    nn = NearestNeighbors(n_neighbors=k+1)  # +1 because first neighbor is the point itself
    nn.fit(embeddings)
    distances, _ = nn.kneighbors(embeddings)
    # Calculate local density (inverse of average distance to k nearest neighbors)
    local_densities = 1.0 / (np.mean(distances[:, 1:], axis=1) + 1e-6)
    # Normalize densities; the epsilon avoids division by zero when all are equal
    lowest = np.min(local_densities)
    highest = np.max(local_densities)
    local_densities = (local_densities - lowest) / (highest - lowest + 1e-6)
    return local_densities
 def apply_density_based_jittering(embeddings, density_scaling=True, jitter_strength=0.1):
    """
    Add Gaussian noise to the points to separate overlapping ones.

    With density_scaling the noise scale of each point is
    jitter_strength * (1 + density), so crowded points move more; without it
    every coordinate gets noise of scale jitter_strength. The input array is
    not modified.
    """
    if density_scaling:
        densities = calculate_local_density_scaling(embeddings)
        result = embeddings.copy()
        for row, density in zip(result, densities):
            # Each row is a view into result, so the in-place add updates it
            scale = jitter_strength * (1 + density)
            row += np.random.normal(0, scale, embeddings.shape[1])
        return result
    # Uniform jitter: same noise scale everywhere
    return embeddings + np.random.normal(0, jitter_strength, embeddings.shape)
 def reduce_dimensions(embeddings, method="PCA", n_components=2, spread_factor=1.0, 
                      perplexity_factor=1.0, min_dist_factor=1.0):
    """
    Apply dimensionality reduction with enhanced separation.

    Args:
        embeddings: Array-like of shape (n_samples, n_features)
        method: "PCA", "t-SNE", "UMAP", "Spectral Embedding" or
            "Force-Directed"; any other value falls back to PCA
        n_components: Number of output dimensions
        spread_factor: Spreading strength for PCA / Spectral Embedding /
            Force-Directed; scales early exaggeration for t-SNE and `spread`
            for UMAP
        perplexity_factor: Multiplier on the t-SNE perplexity
        min_dist_factor: Multiplier on UMAP's min_dist
    Returns:
        Array of shape (n_samples, n_components) with the reduced coordinates.
    """
    # Convert to numpy array if it's not already
    embeddings = np.array(embeddings)
    n_samples = len(embeddings)
    print(f"DEBUG: Starting {method} with {n_samples} embeddings, shape: {embeddings.shape}")
    # Standardize embeddings for better processing
    scaled_embeddings = StandardScaler().fit_transform(embeddings)
    print(f"DEBUG: Embeddings standardized")
    # Apply the selected dimensionality reduction method
    if method == "PCA":
        print(f"DEBUG: Applying PCA...")
        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        # Apply spreading to PCA results
        print(f"DEBUG: Applying spreading...")
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)
    elif method == "t-SNE":
        # Adjust perplexity based on user preference and data size
        base_perplexity = min(30, n_samples - 1)
        adjusted_perplexity = max(5, min(50, int(base_perplexity * perplexity_factor)))
        # scikit-learn requires perplexity < n_samples, which the floor of 5
        # or a factor > 1 could otherwise violate on small datasets.
        adjusted_perplexity = max(1, min(adjusted_perplexity, n_samples - 1))
        print(f"DEBUG: Applying t-SNE with perplexity {adjusted_perplexity}...")
        # The iteration count is left at the library default (1000): the
        # keyword was renamed from n_iter to max_iter across scikit-learn
        # versions, so passing either name breaks on some installs.
        reducer = TSNE(n_components=n_components, random_state=DEFAULT_RANDOM_STATE, 
                      perplexity=adjusted_perplexity,
                      # More early exaggeration gives more separation; values
                      # below 1 are rejected by scikit-learn.
                      early_exaggeration=max(1.0, 12.0 * spread_factor),
                      learning_rate='auto')
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
    elif method == "UMAP":
        # Adjust UMAP parameters for better local separation
        n_neighbors = min(15, n_samples - 1)
        spread = 1.0 * spread_factor
        # UMAP raises if min_dist exceeds spread, so cap it
        min_dist = min(0.1 * min_dist_factor, spread)
        print(f"DEBUG: Applying UMAP with n_neighbors={n_neighbors}, min_dist={min_dist}...")
        reducer = umap.UMAP(n_components=n_components, random_state=DEFAULT_RANDOM_STATE, 
                           n_neighbors=n_neighbors, min_dist=min_dist,
                           spread=spread, local_connectivity=2.0)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
    elif method == "Spectral Embedding":
        n_neighbors = min(10, n_samples - 1)
        print(f"DEBUG: Applying Spectral Embedding with n_neighbors={n_neighbors}...")
        reducer = SpectralEmbedding(n_components=n_components, random_state=DEFAULT_RANDOM_STATE,
                                   n_neighbors=n_neighbors)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        # Apply spreading to spectral results
        print(f"DEBUG: Applying spreading...")
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)
    elif method == "Force-Directed":
        # Use force-directed layout for natural spreading
        print(f"DEBUG: Applying Force-Directed layout...")
        reduced_embeddings = force_directed_layout(scaled_embeddings, n_components, spread_factor)
    else:
        # Fallback to PCA
        print(f"DEBUG: Unknown method {method}, falling back to PCA...")
        reducer = PCA(n_components=n_components, random_state=DEFAULT_RANDOM_STATE)
        reduced_embeddings = reducer.fit_transform(scaled_embeddings)
        reduced_embeddings = apply_adaptive_spreading(reduced_embeddings, spread_factor)
    print(f"DEBUG: Dimensionality reduction complete. Output shape: {reduced_embeddings.shape}")
    return reduced_embeddings
--- a/apps/cluster_map/main.py
+++ b/apps/cluster_map/main.py
@@ -0,0 +1,132 @@
 """
 Main application logic for the Discord Chat Embeddings Visualizer.
 """
 import streamlit as st
 import warnings
 warnings.filterwarnings('ignore')
 # Import custom modules
 from ui_components import (
    setup_page_config, display_title_and_description, get_all_ui_parameters,
    display_performance_warnings
 )
 from data_loader import (
    load_all_chat_data, parse_embeddings, filter_data, get_filtered_embeddings
 )
 from dimensionality_reduction import (
    reduce_dimensions, apply_density_based_jittering
 )
 from clustering import apply_clustering
 from visualization import (
    create_visualization_plot, display_clustering_metrics, display_summary_stats,
    display_clustering_results, display_data_table
 )
 def main():
    """
    Main application function.

    Runs the whole Streamlit page top to bottom: page setup, data loading,
    embedding parsing, sidebar parameters, filtering, dimensionality
    reduction, clustering, optional jittering, then the plot, summary
    statistics, clustering results and data table. The run is ended early
    with st.stop() when there is no data, no valid embeddings, or nothing
    left after filtering.
    """
    # Set up page configuration
    setup_page_config()
    # Display title and description
    display_title_and_description()
    # Load data
    with st.spinner("Loading chat data..."):
        df = load_all_chat_data()
    if df.empty:
        st.error("No data could be loaded. Please check the data directory.")
        st.stop()
    # Parse embeddings
    with st.spinner("Parsing embeddings..."):
        embeddings, valid_df = parse_embeddings(df)
    if len(embeddings) == 0:
        st.error("No valid embeddings found!")
        st.stop()
    # Get UI parameters (a dict; the keys read below are the ones this function relies on)
    params = get_all_ui_parameters(valid_df)
    # Filter data
    filtered_df = filter_data(valid_df, params['selected_sources'], params['selected_authors'])
    if filtered_df.empty:
        st.warning("No data matches the current filters!")
        st.stop()
    # Display performance warnings
    display_performance_warnings(filtered_df, params['method'], params['clustering_method'])
    # Get corresponding embeddings
    filtered_embeddings = get_filtered_embeddings(embeddings, valid_df, filtered_df)
    st.info(f"📈 Visualizing {len(filtered_df)} messages")
    # Reduce dimensions
    with st.spinner(f"Reducing dimensions using {params['method']}..."):
        reduced_embeddings = reduce_dimensions(
            filtered_embeddings, 
            method=params['method'],
            spread_factor=params['spread_factor'],
            perplexity_factor=params['perplexity_factor'],
            min_dist_factor=params['min_dist_factor']
        )
    # Apply clustering
    # Clustering runs on the original high-dimensional embeddings, not on the
    # reduced coordinates used for plotting.
    with st.spinner(f"Applying {params['clustering_method']}..."):
        cluster_labels, silhouette_avg, calinski_harabasz = apply_clustering(
            filtered_embeddings,
            clustering_method=params['clustering_method'],
            n_clusters=params['n_clusters']
        )
    # Apply jittering if requested
    # Jitter only changes the plotted coordinates; the cluster labels were
    # already computed above.
    if params['apply_jittering']:
        with st.spinner("Applying smart jittering to separate overlapping points..."):
            reduced_embeddings = apply_density_based_jittering(
                reduced_embeddings, 
                density_scaling=params['density_based_jitter'], 
                jitter_strength=params['jitter_strength']
            )
    # Display clustering metrics
    display_clustering_metrics(
        cluster_labels, silhouette_avg, calinski_harabasz, 
        params['show_cluster_metrics']
    )
    # Create and display the main plot
    fig = create_visualization_plot(
        reduced_embeddings=reduced_embeddings,
        filtered_df=filtered_df,
        cluster_labels=cluster_labels,
        # An empty selection is passed as None rather than as an empty list
        selected_sources=params['selected_sources'] if params['selected_sources'] else None,
        method=params['method'],
        clustering_method=params['clustering_method'],
        point_size=params['point_size'],
        point_opacity=params['point_opacity'],
        density_based_sizing=params['density_based_sizing'],
        size_variation=params['size_variation']
    )
    st.plotly_chart(fig, use_container_width=True)
    # Display summary statistics (falls back to every source present when none is selected)
    display_summary_stats(filtered_df, params['selected_sources'] or filtered_df['source_file'].unique())
    # Display clustering results and export options
    display_clustering_results(
        filtered_df, cluster_labels, reduced_embeddings, 
        params['method'], params['clustering_method']
    )
    # Display data table
    display_data_table(filtered_df, cluster_labels)
 # Run the app when this module is executed as a script
 if __name__ == "__main__":
    main()
--- a/apps/cluster_map/requirements.txt
+++ b/apps/cluster_map/requirements.txt
@@ -3,3 +3,6 @@ pandas>=1.5.0
 numpy>=1.24.0
 plotly>=5.15.0
 scikit-learn>=1.3.0
 umap-learn>=0.5.3
 hdbscan>=0.8.29
 scipy>=1.10.0
--- a/apps/cluster_map/streamlit_app.py
+++ b/apps/cluster_map/streamlit_app.py
@@ -1,233 +0,0 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 import json
 import os
 from pathlib import Path
 import ast
 # Set page config
 # These calls sit at module top level, so they run whenever this module is
 # loaded, before main() is called.
 st.set_page_config(
    page_title="Discord Chat Embeddings Visualizer",
    page_icon="🗨️",
    layout="wide"
 )
 # Title and description shown at the top of the page
 st.title("🗨️ Discord Chat Embeddings Visualizer")
 st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
@st.cache_data
 def load_all_chat_data():
    """Load all CSV files from the discord_chat_logs folder"""
    chat_logs_path = Path("../../discord_chat_logs")
    # Display the path for debugging
    st.write(f"Looking for CSV files in: {chat_logs_path}")
    st.write(f"Path exists: {chat_logs_path.exists()}")
    all_data = []
    for csv_file in chat_logs_path.glob("*.csv"):
        try:
            df = pd.read_csv(csv_file)
            df['source_file'] = csv_file.stem  # Add source file name
            all_data.append(df)
            st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}")
        except Exception as e:
            st.error(f"❌ Error loading {csv_file.name}: {e}")
    if all_data:
        combined_df = pd.concat(all_data, ignore_index=True)
        st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files")
        return combined_df
    else:
        st.error("No data loaded!")
        return pd.DataFrame()
@st.cache_data
 def parse_embeddings(df):
    """Parse the content_embedding column from string to numpy array"""
    embeddings = []
    valid_indices = []
    for idx, embedding_str in enumerate(df['content_embedding']):
        try:
            # Parse the string representation of the list
            embedding = ast.literal_eval(embedding_str)
            if isinstance(embedding, list) and len(embedding) > 0:
                embeddings.append(embedding)
                valid_indices.append(idx)
        except Exception as e:
            continue
    embeddings_array = np.array(embeddings)
    valid_df = df.iloc[valid_indices].copy()
    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
    st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")
    return embeddings_array, valid_df
@st.cache_data
 def reduce_dimensions(embeddings, method="PCA", n_components=2):
    """Reduce embeddings to 2D using PCA or t-SNE"""
    if method == "PCA":
        reducer = PCA(n_components=n_components, random_state=42)
    elif method == "t-SNE":
        reducer = TSNE(n_components=n_components, random_state=42, perplexity=min(30, len(embeddings)-1))
    reduced_embeddings = reducer.fit_transform(embeddings)
    return reduced_embeddings
 def create_hover_text(df):
    """
    Build one HTML hover label per row of df.

    Each label lists author, timestamp, source file and the message content,
    cut to 200 characters with a trailing '...' when longer. Missing content
    is shown as "[No content]".
    """
    def describe(row):
        header = (
            f"<b>Author:</b> {row['author_name']}<br>"
            f"<b>Timestamp:</b> {row['timestamp_utc']}<br>"
            f"<b>Source:</b> {row['source_file']}<br>"
        )
        raw = row['content']
        # Content may be NaN/None or a non-string value
        if pd.isna(raw) or raw is None:
            preview = "[No content]"
        else:
            as_text = str(raw)
            suffix = '...' if len(as_text) > 200 else ''
            preview = as_text[:200] + suffix
        return header + f"<b>Content:</b> {preview}"

    return [describe(row) for _, row in df.iterrows()]
 def main():
    """
    Render the whole page: load data, parse embeddings, read the sidebar
    filters, reduce to 2D, draw one scatter trace per source file, then show
    summary metrics and an optional data table. The run is ended early with
    st.stop() when there is no data, no valid embeddings, or nothing left
    after filtering.
    """
    # Load data
    with st.spinner("Loading chat data..."):
        df = load_all_chat_data()
    if df.empty:
        st.stop()
    # Parse embeddings
    with st.spinner("Parsing embeddings..."):
        embeddings, valid_df = parse_embeddings(df)
    if len(embeddings) == 0:
        st.error("No valid embeddings found!")
        st.stop()
    # Sidebar controls
    st.sidebar.header("🎛️ Visualization Controls")
    # Dimension reduction method
    method = st.sidebar.selectbox(
        "Dimension Reduction Method",
        ["PCA", "t-SNE"],
        help="PCA is faster, t-SNE may reveal better clusters"
    )
    # Source file filter
    source_files = valid_df['source_file'].unique()
    selected_sources = st.sidebar.multiselect(
        "Filter by Source Files",
        source_files,
        default=source_files,
        help="Select which chat log files to include"
    )
    # Author filter
    authors = valid_df['author_name'].unique()
    selected_authors = st.sidebar.multiselect(
        "Filter by Authors",
        authors,
        default=authors[:10] if len(authors) > 10 else authors,  # Limit to first 10 for performance
        help="Select which authors to include"
    )
    # Filter data: a row must match both a selected source and a selected author
    filtered_df = valid_df[
        (valid_df['source_file'].isin(selected_sources)) &
        (valid_df['author_name'].isin(selected_authors))
    ]
    if filtered_df.empty:
        st.warning("No data matches the current filters!")
        st.stop()
    # Get corresponding embeddings
    # Embedding rows are positional (row i belongs to valid_df row i), so the
    # filtered index labels are mapped back to positions in valid_df.
    # NOTE(review): `idx in filtered_indices` scans a list for every row,
    # which is quadratic in the number of messages.
    filtered_indices = filtered_df.index.tolist()
    filtered_embeddings = embeddings[[i for i, idx in enumerate(valid_df.index) if idx in filtered_indices]]
    st.info(f"📈 Visualizing {len(filtered_df)} messages")
    # Reduce dimensions
    with st.spinner(f"Reducing dimensions using {method}..."):
        reduced_embeddings = reduce_dimensions(filtered_embeddings, method)
    # Create hover text
    hover_text = create_hover_text(filtered_df)
    # Create the plot
    fig = go.Figure()
    # Color by source file: one trace per selected source; colors repeat when
    # there are more sources than palette entries
    colors = px.colors.qualitative.Set1
    for i, source in enumerate(selected_sources):
        source_mask = filtered_df['source_file'] == source
        if source_mask.any():
            source_data = filtered_df[source_mask]
            source_embeddings = reduced_embeddings[source_mask]
            source_hover = [hover_text[j] for j, mask in enumerate(source_mask) if mask]
            fig.add_trace(go.Scatter(
                x=source_embeddings[:, 0],
                y=source_embeddings[:, 1],
                mode='markers',
                name=source,
                marker=dict(
                    size=8,
                    color=colors[i % len(colors)],
                    opacity=0.7,
                    line=dict(width=1, color='white')
                ),
                hovertemplate='%{hovertext}<extra></extra>',
                hovertext=source_hover
            ))
    fig.update_layout(
        title=f"Discord Chat Messages - {method} Visualization",
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        hovermode='closest',
        width=1000,
        height=700
    )
    # Display the plot
    st.plotly_chart(fig, use_container_width=True)
    # Statistics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Messages", len(filtered_df))
    with col2:
        st.metric("Unique Authors", filtered_df['author_name'].nunique())
    with col3:
        st.metric("Source Files", len(selected_sources))
    # Show data table
    if st.checkbox("Show Data Table"):
        st.subheader("📋 Message Data")
        display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
        # NOTE(review): '...' is appended to every row, including messages
        # shorter than 100 characters.
        display_df['content'] = display_df['content'].str[:100] + '...'  # Truncate for display
        st.dataframe(display_df, use_container_width=True)
 # Run the app when this module is executed as a script
 if __name__ == "__main__":
    main()
--- a/apps/cluster_map/test_debug.py
+++ b/apps/cluster_map/test_debug.py
@@ -0,0 +1,43 @@
 #!/usr/bin/env python3
 """
 Test script to debug the hanging issue in the modular app
 """
 import numpy as np
 import sys
 import os
 # Add the directory containing this file to the import search path so the
 # local `dimensionality_reduction` module (imported inside the test function)
 # can be resolved.
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
 def test_dimensionality_reduction():
    """
    Smoke-test `reduce_dimensions` on random embeddings.
    Runs PCA first and UMAP second, printing a success line with the output
    shape or a failure line with the exception message for each. Failures are
    reported, never raised, so both methods are always attempted.
    """
    print("Testing dimensionality reduction functions...")
    from dimensionality_reduction import reduce_dimensions
    # Random data with the same shape as the dataset being debugged
    n_samples, n_features = 796, 384
    print(f"Creating test embeddings: {n_samples} x {n_features}")
    test_embeddings = np.random.randn(n_samples, n_features)
    # PCA is expected to be fast; UMAP may take noticeably longer
    for reduction_name in ("PCA", "UMAP"):
        print(f"Testing {reduction_name}...")
        try:
            projected = reduce_dimensions(test_embeddings, method=reduction_name)
            print(f"✓ {reduction_name} successful, output shape: {projected.shape}")
        except Exception as exc:
            print(f"✗ {reduction_name} failed: {exc}")
 # Run the debug check only when this file is executed directly, not on import.
 if __name__ == "__main__":
    test_dimensionality_reduction()
--- a/apps/cluster_map/ui_components.py
+++ b/apps/cluster_map/ui_components.py
@@ -0,0 +1,236 @@
 """
 Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
 """
 import streamlit as st
 import numpy as np
 from config import (
    APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
    CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
    LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
 )
 def setup_page_config():
    """Apply the page title, icon and layout from the config constants to the Streamlit page"""
    page_settings = {
        "page_title": APP_TITLE,
        "page_icon": APP_ICON,
        "layout": APP_LAYOUT,
    }
    st.set_page_config(**page_settings)
 def display_title_and_description():
    """Render the page heading (icon + title) followed by a one-line description"""
    heading = f"{APP_ICON} {APP_TITLE}"
    tagline = "Explore Discord chat messages through their vector embeddings in 2D space"
    st.title(heading)
    st.markdown(tagline)
 def create_method_controls():
    """
    Render the sidebar selectors for the projection and clustering methods.
    Returns:
        tuple: (method, clustering_method) as picked in the two select boxes
    """
    sidebar = st.sidebar
    sidebar.header("🎛️ Visualization Controls")
    # Choices offered for projecting the embeddings to 2D
    reduction_choices = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
    reduction_help = "PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing."
    method = sidebar.selectbox(
        "Dimension Reduction Method",
        reduction_choices,
        help=reduction_help,
    )
    # Choices offered for grouping the points; "None" is listed first
    clustering_choices = [
        "None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture",
        "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS",
    ]
    clustering_help = "Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters."
    clustering_method = sidebar.selectbox(
        "Clustering Method",
        clustering_choices,
        help=clustering_help,
    )
    return method, clustering_method
 def create_clustering_controls(clustering_method):
    """
    Return the number of clusters to use for the chosen clustering method.
    A sidebar slider (2-15, default 5) is shown only for methods listed in
    CLUSTERING_METHODS_REQUIRING_N_CLUSTERS; every other method gets the fixed
    value 5 and no slider.
    """
    if clustering_method not in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
        return 5
    return st.sidebar.slider("Number of Clusters", 2, 15, 5)
 def create_separation_controls(method):
    """
    Render the sidebar sliders that control point separation.
    The spread slider is always shown. The perplexity slider appears only for
    "t-SNE" and the min-distance slider only for "UMAP"; a factor whose slider
    is not shown stays at 1.0.
    Returns:
        tuple: (spread_factor, perplexity_factor, min_dist_factor)
    """
    sidebar = st.sidebar
    sidebar.subheader("🎯 Point Separation Controls")
    spread_factor = sidebar.slider(
        "Spread Factor",
        min_value=0.5,
        max_value=3.0,
        value=1.0,
        step=0.1,
        help="Increase to spread apart nearby points. Higher values create more separation.",
    )
    # Both factors default to 1.0 when their slider is not displayed
    perplexity_factor = min_dist_factor = 1.0
    if method == "t-SNE":
        perplexity_factor = sidebar.slider(
            "Perplexity Factor",
            min_value=0.5,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Affects local vs global structure balance. Lower values focus on local details.",
        )
    if method == "UMAP":
        min_dist_factor = sidebar.slider(
            "Min Distance Factor",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls how tightly points are packed. Lower values create tighter clusters.",
        )
    return spread_factor, perplexity_factor, min_dist_factor
 def create_jittering_controls():
    """
    Render the sidebar controls for jittering.
    The strength slider and the density checkbox are shown only while the
    main jittering checkbox is ticked; otherwise they report 0.1 and True.
    Returns:
        tuple: (apply_jittering, jitter_strength, density_based_jitter)
    """
    apply_jittering = st.sidebar.checkbox(
        "Apply Smart Jittering",
        value=False,
        help="Add intelligent noise to separate overlapping points",
    )
    if not apply_jittering:
        # Detail widgets stay hidden; hand back their default values
        return apply_jittering, 0.1, True
    jitter_strength = st.sidebar.slider(
        "Jitter Strength",
        0.01, 0.5, 0.1, 0.01,
        help="Strength of jittering. Higher values spread points more.",
    )
    density_based_jitter = st.sidebar.checkbox(
        "Density-Based Jittering",
        value=True,
        help="Apply stronger jittering in dense regions",
    )
    return apply_jittering, jitter_strength, density_based_jitter
 def create_advanced_options():
    """
    Render the "Advanced Options" expander in the sidebar.
    The size-variation slider is shown only while density-based sizing is
    ticked; otherwise size_variation is 2.0.
    Returns:
        tuple: (show_cluster_metrics, point_size, point_opacity,
                density_based_sizing, size_variation)
    """
    with st.sidebar.expander("⚙️ Advanced Options"):
        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
        point_size = st.slider("Point Size", min_value=4, max_value=15, value=8)
        point_opacity = st.slider("Point Opacity", min_value=0.3, max_value=1.0, value=0.7)
        # Optional sizing of points by local density
        density_based_sizing = st.checkbox(
            "Density-Based Point Sizing",
            value=False,
            help="Make points larger in sparse regions, smaller in dense regions",
        )
        if density_based_sizing:
            size_variation = st.slider(
                "Size Variation Factor",
                min_value=1.5,
                max_value=4.0,
                value=2.0,
                step=0.1,
                help="How much point sizes vary based on local density",
            )
        else:
            size_variation = 2.0
    options = (show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation)
    return options
 def create_filter_controls(valid_df):
    """
    Render the sidebar multiselects for filtering by source file and author.
    The source selector starts empty. The author selector starts with at most
    the first MAX_DISPLAYED_AUTHORS distinct authors pre-selected.
    Returns:
        tuple: (selected_sources, selected_authors)
    """
    sidebar = st.sidebar
    # Source file filter
    available_sources = valid_df['source_file'].unique()
    selected_sources = sidebar.multiselect(
        "Filter by Source Files",
        available_sources,
        default=[],
        help="Select which chat log files to include",
    )
    # Author filter; slicing already yields every author when there are few
    available_authors = valid_df['author_name'].unique()
    preselected_authors = available_authors[:MAX_DISPLAYED_AUTHORS]
    selected_authors = sidebar.multiselect(
        "Filter by Authors",
        available_authors,
        default=preselected_authors,
        help="Select which authors to include",
    )
    return selected_sources, selected_authors
 def display_method_explanations():
    """Render a sidebar expander listing every entry of METHOD_EXPLANATIONS, grouped by section"""
    # (heading shown, key into METHOD_EXPLANATIONS) in display order
    sections = (
        ("**Dimensionality Reduction:**", "dimension_reduction"),
        ("\n**Clustering Methods:**", "clustering"),
        ("\n**Separation Techniques:**", "separation"),
        ("\n**Metrics:**", "metrics"),
    )
    st.sidebar.markdown("---")
    with st.sidebar.expander("📚 Method Explanations"):
        for heading, section_key in sections:
            st.markdown(heading)
            for entry_name, entry_text in METHOD_EXPLANATIONS[section_key].items():
                st.markdown(f"- **{entry_name}**: {entry_text}")
 def display_performance_warnings(filtered_df, method, clustering_method):
    """
    Warn when a method flagged as computationally intensive is used on a large dataset.
    Nothing is shown unless the row count exceeds LARGE_DATASET_WARNING_THRESHOLD.
    """
    point_count = len(filtered_df)
    if point_count <= LARGE_DATASET_WARNING_THRESHOLD:
        return
    if method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]:
        st.warning(f"⚠️ {method} with {point_count} points may take several minutes to compute.")
    if clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]:
        st.warning(f"⚠️ {clustering_method} with {point_count} points may be computationally intensive.")
 def get_all_ui_parameters(valid_df):
    """
    Render every sidebar control and collect the chosen values.
    The helper functions are called in a fixed order: method selection,
    clustering parameters, separation, jittering, advanced options, filters
    and finally the method explanations.
    Returns:
        dict: one entry per UI parameter, keyed by parameter name
    """
    params = {}
    # Method selection
    params['method'], params['clustering_method'] = create_method_controls()
    # Clustering parameters
    params['n_clusters'] = create_clustering_controls(params['clustering_method'])
    # Separation controls
    (
        params['spread_factor'],
        params['perplexity_factor'],
        params['min_dist_factor'],
    ) = create_separation_controls(params['method'])
    # Jittering controls
    (
        params['apply_jittering'],
        params['jitter_strength'],
        params['density_based_jitter'],
    ) = create_jittering_controls()
    # Advanced options
    (
        params['show_cluster_metrics'],
        params['point_size'],
        params['point_opacity'],
        params['density_based_sizing'],
        params['size_variation'],
    ) = create_advanced_options()
    # Filters
    params['selected_sources'], params['selected_authors'] = create_filter_controls(valid_df)
    # Method explanations
    display_method_explanations()
    return params
--- a/apps/cluster_map/visualization.py
+++ b/apps/cluster_map/visualization.py
@@ -0,0 +1,225 @@
 """
 Visualization functions for creating interactive plots and displays.
 """
 import pandas as pd
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 import streamlit as st
 from dimensionality_reduction import calculate_local_density_scaling
 from config import MESSAGE_CONTENT_PREVIEW_LENGTH, DEFAULT_POINT_SIZE, DEFAULT_POINT_OPACITY
 def create_hover_text(df):
    """
    Build one HTML hover label per row of *df*.
    Each label shows author, timestamp, source file and a content preview of
    at most MESSAGE_CONTENT_PREVIEW_LENGTH characters ('...' is appended only
    when the content is longer). Missing content is shown as "[No content]".
    Returns:
        list: hover strings in row order
    """
    labels = []
    for _, record in df.iterrows():
        header_parts = [
            f"<b>Author:</b> {record['author_name']}<br>",
            f"<b>Timestamp:</b> {record['timestamp_utc']}<br>",
            f"<b>Source:</b> {record['source_file']}<br>",
        ]
        # Content may be NaN/None or a non-string value
        content = record['content']
        if content is None or pd.isna(content):
            content_text = "[No content]"
        else:
            content_str = str(content)
            content_text = content_str[:MESSAGE_CONTENT_PREVIEW_LENGTH]
            if len(content_str) > MESSAGE_CONTENT_PREVIEW_LENGTH:
                content_text += '...'
        header_parts.append(f"<b>Content:</b> {content_text}")
        labels.append("".join(header_parts))
    return labels
 def calculate_point_sizes(reduced_embeddings, density_based_sizing=False,
                          point_size=DEFAULT_POINT_SIZE, size_variation=2.0):
    """
    Return one marker size per embedded point.
    With density_based_sizing off, every point gets point_size (returned as a
    list). With it on, the values from calculate_local_density_scaling are
    inverted (1.0 - density) and used to scale point_size by a factor between
    1.0 and size_variation for density values in [0, 1].
    """
    if density_based_sizing:
        # Inverting gives the largest multiplier to the lowest density values
        sparsity = 1.0 - calculate_local_density_scaling(reduced_embeddings)
        return point_size * (1.0 + sparsity * (size_variation - 1.0))
    return [point_size] * len(reduced_embeddings)
 def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover_text,
                          point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA"):
    """
    Build a scatter figure with one trace per cluster label.
    Label -1 is named "Noise"; every other label is named "Cluster <id>".
    Colours cycle through the combined Set3 + Pastel palettes. The
    filtered_df and method arguments are accepted but not used here.
    """
    palette = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel
    figure = go.Figure()
    for palette_idx, cluster_id in enumerate(np.unique(cluster_labels)):
        in_cluster = cluster_labels == cluster_id
        if not in_cluster.any():
            continue
        # Positions of the members, used to pick their hover text and sizes
        member_positions = np.flatnonzero(in_cluster)
        member_points = reduced_embeddings[in_cluster]
        member_hover = [hover_text[pos] for pos in member_positions]
        member_sizes = [point_sizes[pos] for pos in member_positions]
        trace_name = "Noise" if cluster_id == -1 else f"Cluster {cluster_id}"
        marker_style = dict(
            size=member_sizes,
            color=palette[palette_idx % len(palette)],
            opacity=point_opacity,
            line=dict(width=1, color='white'),
        )
        figure.add_trace(go.Scatter(
            x=member_points[:, 0],
            y=member_points[:, 1],
            mode='markers',
            name=trace_name,
            marker=marker_style,
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=member_hover,
        ))
    return figure
 def create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources, hover_text,
                               point_sizes, point_opacity=DEFAULT_POINT_OPACITY):
    """
    Build a scatter figure with one trace per source file.
    Sources in selected_sources that match no row of filtered_df get no
    trace. Colours cycle through the Set1 palette by position in
    selected_sources.
    """
    palette = px.colors.qualitative.Set1
    figure = go.Figure()
    for palette_idx, source in enumerate(selected_sources):
        from_source = filtered_df['source_file'] == source
        if not from_source.any():
            continue
        # Row positions (not index labels) of the messages from this source
        row_positions = np.flatnonzero(from_source)
        source_points = reduced_embeddings[from_source]
        source_hover = [hover_text[pos] for pos in row_positions]
        source_sizes = [point_sizes[pos] for pos in row_positions]
        marker_style = dict(
            size=source_sizes,
            color=palette[palette_idx % len(palette)],
            opacity=point_opacity,
            line=dict(width=1, color='white'),
        )
        figure.add_trace(go.Scatter(
            x=source_points[:, 0],
            y=source_points[:, 1],
            mode='markers',
            name=source,
            marker=marker_style,
            hovertemplate='%{hovertext}<extra></extra>',
            hovertext=source_hover,
        ))
    return figure
 def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=None,
                              selected_sources=None, method="PCA", clustering_method="None",
                              point_size=DEFAULT_POINT_SIZE, point_opacity=DEFAULT_POINT_OPACITY,
                              density_based_sizing=False, size_variation=2.0):
    """
    Build the main scatter figure.
    Points are coloured by cluster when cluster_labels is given, otherwise by
    source file (all sources present in filtered_df when selected_sources is
    None). The title and axis labels name the reduction method; the title
    also names the clustering method unless it is "None".
    """
    hover_text = create_hover_text(filtered_df)
    point_sizes = calculate_point_sizes(
        reduced_embeddings, density_based_sizing, point_size, size_variation
    )
    # Pick the colouring strategy
    if cluster_labels is None:
        if selected_sources is None:
            selected_sources = filtered_df['source_file'].unique()
        fig = create_source_colored_plot(
            reduced_embeddings, filtered_df, selected_sources,
            hover_text, point_sizes, point_opacity
        )
    else:
        fig = create_clustered_plot(
            reduced_embeddings, filtered_df, cluster_labels,
            hover_text, point_sizes, point_opacity, method
        )
    # Title mentions the clustering method only when one is active
    title_suffix = ""
    if clustering_method != "None":
        title_suffix = f" with {clustering_method}"
    layout_settings = dict(
        title=f"Discord Chat Messages - {method} Visualization{title_suffix}",
        xaxis_title=f"{method} Component 1",
        yaxis_title=f"{method} Component 2",
        hovermode='closest',
        width=1000,
        height=700,
    )
    fig.update_layout(**layout_settings)
    return fig
 def display_clustering_metrics(cluster_labels, silhouette_avg, calinski_harabasz, show_metrics=True):
    """
    Show three metrics side by side: cluster count, silhouette score and
    Calinski-Harabasz index.
    Nothing is rendered when cluster_labels is None or show_metrics is false.
    Label -1 is excluded from the cluster count; a score that is None is
    shown as "N/A".
    """
    if cluster_labels is None or not show_metrics:
        return
    count_col, silhouette_col, calinski_col = st.columns(3)
    with count_col:
        labelled = cluster_labels[cluster_labels != -1]
        st.metric("Clusters Found", len(np.unique(labelled)))
    with silhouette_col:
        silhouette_text = "N/A" if silhouette_avg is None else f"{silhouette_avg:.3f}"
        st.metric("Silhouette Score", silhouette_text)
    with calinski_col:
        calinski_text = "N/A" if calinski_harabasz is None else f"{calinski_harabasz:.1f}"
        st.metric("Calinski-Harabasz Index", calinski_text)
 def display_summary_stats(filtered_df, selected_sources):
    """Show message count, distinct author count and number of selected sources in three columns"""
    messages_col, authors_col, sources_col = st.columns(3)
    messages_col.metric("Total Messages", len(filtered_df))
    authors_col.metric("Unique Authors", filtered_df['author_name'].nunique())
    sources_col.metric("Source Files", len(selected_sources))
 def display_clustering_results(filtered_df, cluster_labels, reduced_embeddings, method, clustering_method):
    """
    Show the cluster size distribution and offer the clustered data as a CSV download.
    Does nothing when cluster_labels is None. The exported CSV is a copy of
    filtered_df with cluster_id, x_coordinate and y_coordinate columns added;
    filtered_df itself is not modified.
    """
    if cluster_labels is None:
        return
    st.subheader("📊 Clustering Results")
    # Copy of the data extended with cluster id and 2D position for export
    export_df = filtered_df.assign(
        cluster_id=cluster_labels,
        x_coordinate=reduced_embeddings[:, 0],
        y_coordinate=reduced_embeddings[:, 1],
    )
    # Number of points per cluster label, ordered by label
    label_counts = pd.Series(cluster_labels).value_counts().sort_index()
    st.bar_chart(label_counts)
    # Download option
    st.download_button(
        label="📥 Download Clustering Results (CSV)",
        data=export_df.to_csv(index=False),
        file_name=f"chat_clusters_{method}_{clustering_method}.csv",
        mime="text/csv",
    )
 def _truncate_for_display(value, limit=100):
    """Return *value* as text cut to *limit* characters, adding '...' only when something was cut"""
    if not isinstance(value, str):
        # Leave missing values (None / NaN) untouched so they still show as empty cells
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return value
        value = str(value)
    if len(value) > limit:
        return value[:limit] + '...'
    return value
 def display_data_table(filtered_df, cluster_labels=None):
    """
    Display the data table with optional clustering information.
    The table is rendered only while the "Show Data Table" checkbox is ticked.
    Message content is shortened to 100 characters; the '...' marker is added
    only to messages that were actually shortened, and non-string or missing
    content no longer breaks the table.
    Args:
        filtered_df: DataFrame with timestamp_utc, author_name, source_file and content columns
        cluster_labels: Optional per-row cluster labels shown in an extra 'cluster' column
    """
    if not st.checkbox("Show Data Table"):
        return
    st.subheader("📋 Message Data")
    display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
    # Add clustering info if available
    if cluster_labels is not None:
        display_df['cluster'] = cluster_labels
    # Element-wise map instead of the .str accessor, which raises on non-string columns
    display_df['content'] = display_df['content'].map(_truncate_for_display)
    st.dataframe(display_df, use_container_width=True)
Author	SHA1	Message	Date
Azeem Fidahusein	4ca7e8ab61	refactor	2025-08-11 02:37:21 +01:00
Azeem Fidahusein	6d35b42b27	updated reqs from clusteing	2025-08-11 02:22:59 +01:00