better clusters and QoL

2025-08-11 03:04:50 +01:00
parent 647111e9d3
commit 2b8659fc95
5 changed files with 234 additions and 15 deletions
--- a/apps/cluster_map/clustering.py
+++ b/apps/cluster_map/clustering.py
@@ -9,9 +9,136 @@ from sklearn.mixture import GaussianMixture
 from sklearn.preprocessing import StandardScaler
 from sklearn.metrics import silhouette_score, calinski_harabasz_score
 import hdbscan
+import pandas as pd
+from collections import Counter
+import re
 from config import DEFAULT_RANDOM_STATE


+# Text patterns removed or normalised before word counting (compiled once at import).
+_URL_PATTERN = re.compile(r'http[s]?://\S+')
+_MENTION_PATTERN = re.compile(r'<@\d+>')
+# `a?` also strips animated custom emojis (<a:name:id>); without it the emoji
+# name survives punctuation stripping and can end up in the cluster name.
+_CUSTOM_EMOJI_PATTERN = re.compile(r'<a?:\w+:\d+>')
+_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
+_WHITESPACE_PATTERN = re.compile(r'\s+')
+
+# Common filler words that carry no topical meaning for a cluster label.
+_CLUSTER_NAME_STOP_WORDS = frozenset({
+    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
+    'by', 'from', 'up', 'about', 'into', 'through', 'during', 'before', 'after',
+    'above', 'below', 'between', 'among', 'until', 'without', 'under', 'over',
+    'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
+    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
+    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them',
+    'my', 'your', 'his', 'its', 'our', 'their', 'this', 'that', 'these', 'those',
+    'just', 'like', 'get', 'know', 'think', 'see', 'go', 'come', 'say', 'said',
+    'yeah', 'yes', 'no', 'oh', 'ok', 'okay', 'well', 'so', 'if', 'when',
+    'what', 'where', 'why', 'how', 'who', 'which', 'than', 'then', 'now', 'here',
+    'there', 'also', 'too', 'very', 'really', 'pretty', 'much', 'more', 'most',
+    'some', 'any', 'all', 'many', 'few', 'little', 'big', 'small', 'good', 'bad',
+})
+
+
+def summarize_cluster_content(cluster_messages, max_words=3):
+    """
+    Generate a meaningful name for a cluster based on its message content.
+
+    The messages are joined, lower-cased and stripped of URLs, Discord user
+    mentions, custom emojis (static and animated) and punctuation. The most
+    frequent remaining words (alphabetic, 3-15 characters, not stop words)
+    form the name, skipping any word that shares a 4-character prefix with a
+    word already chosen.
+
+    Args:
+        cluster_messages: Sized collection (list, array, Series) of message
+            contents in the cluster; missing values (NaN/None) are ignored.
+        max_words: Maximum number of words in the cluster name
+
+    Returns:
+        str: Generated cluster name, e.g. "Python + Code (12)". Fallbacks:
+            "Empty Cluster" (no messages), "Empty Content" (only blank or
+            missing messages), "Special Characters" (nothing left after
+            cleaning), "Chat (N msgs)" (only stop/short/non-alphabetic words)
+            and "Discussion (N msgs)" (no word could be selected, e.g.
+            max_words <= 0).
+    """
+    # Explicit length check: `not x` raises for NumPy arrays and pandas Series.
+    if cluster_messages is None or len(cluster_messages) == 0:
+        return "Empty Cluster"
+
+    message_count = len(cluster_messages)
+
+    # Combine all non-missing messages into one text blob
+    all_text = " ".join(str(msg) for msg in cluster_messages if pd.notna(msg))
+    if not all_text.strip():
+        return "Empty Content"
+
+    # Lower-case first so the patterns and stop words only need one casing
+    text = all_text.lower()
+    text = _URL_PATTERN.sub('', text)
+    text = _MENTION_PATTERN.sub('', text)
+    text = _CUSTOM_EMOJI_PATTERN.sub('', text)
+    text = _PUNCTUATION_PATTERN.sub(' ', text)
+    text = _WHITESPACE_PATTERN.sub(' ', text).strip()
+
+    if not text:
+        return "Special Characters"
+
+    # Keep only alphabetic, reasonably sized, non-stop words
+    filtered_words = [
+        word for word in text.split()
+        if 3 <= len(word) <= 15
+        and word.isalpha()
+        and word not in _CLUSTER_NAME_STOP_WORDS
+    ]
+
+    if not filtered_words:
+        return f"Chat ({message_count} msgs)"
+
+    # Walk the whole frequency ranking (not just the top few) so the name still
+    # reaches max_words when the most frequent words are near-duplicates.
+    selected_words = []
+    if max_words > 0:
+        for word, _count in Counter(filtered_words).most_common():
+            # Treat words sharing a 4-character prefix as the same topic
+            is_near_duplicate = any(
+                word.startswith(chosen[:4]) or chosen.startswith(word[:4])
+                for chosen in selected_words
+            )
+            if is_near_duplicate:
+                continue
+            selected_words.append(word)
+            if len(selected_words) >= max_words:
+                break
+
+    if not selected_words:
+        return f"Discussion ({message_count} msgs)"
+
+    # Title-cased words joined with " + ", followed by the message count
+    return f"{' + '.join(selected_words).title()} ({message_count})"
+
+
+def generate_cluster_names(filtered_df, cluster_labels):
+    """
+    Generate names for all clusters based on their content.
+
+    Args:
+        filtered_df: DataFrame with message data; its 'content' column is
+            read for every non-noise cluster.
+        cluster_labels: Sequence of cluster labels (list, array or Series),
+            one per row of filtered_df in row order, or None when no
+            clustering was applied.
+
+    Returns:
+        dict: Mapping from cluster_id to cluster_name. Label -1 is always
+            named "Noise/Outliers"; an empty dict is returned when
+            cluster_labels is None.
+    """
+    if cluster_labels is None:
+        return {}
+
+    # Coerce to an ndarray so `labels == cluster_id` is always an element-wise
+    # mask; with a plain list the comparison is a single bool and the
+    # DataFrame lookup fails with a KeyError.
+    labels = np.asarray(cluster_labels)
+
+    cluster_names = {}
+    for cluster_id in np.unique(labels):
+        if cluster_id == -1:
+            cluster_names[cluster_id] = "Noise/Outliers"
+            continue
+
+        # Select only the 'content' cells of this cluster's rows (positional mask)
+        cluster_messages = filtered_df.loc[labels == cluster_id, 'content'].tolist()
+        cluster_names[cluster_id] = summarize_cluster_content(cluster_messages)
+
+    return cluster_names
+
+
 def apply_clustering(embeddings, clustering_method="None", n_clusters=5):
    """
    Apply clustering algorithm to embeddings and return labels and metrics.
--- a/apps/cluster_map/config.py
+++ b/apps/cluster_map/config.py
@@ -3,7 +3,7 @@ Configuration settings and constants for the Discord Chat Embeddings Visualizer.
 """

 # Application settings
-APP_TITLE = "Discord Chat Embeddings Visualizer"
+APP_TITLE = "The Cult - Visualised"
 APP_ICON = "🗨️"
 APP_LAYOUT = "wide"

@@ -14,6 +14,8 @@ CHAT_LOGS_PATH = "../../discord_chat_logs"
 DEFAULT_RANDOM_STATE = 42
 DEFAULT_N_COMPONENTS = 2
 DEFAULT_N_CLUSTERS = 5
+DEFAULT_DIMENSION_REDUCTION_METHOD = "t-SNE"
+DEFAULT_CLUSTERING_METHOD = "None"

 # Visualization settings
 DEFAULT_POINT_SIZE = 8
--- a/apps/cluster_map/main.py
+++ b/apps/cluster_map/main.py
@@ -17,10 +17,10 @@ from data_loader import (
 from dimensionality_reduction import (
    reduce_dimensions, apply_density_based_jittering
 )
-from clustering import apply_clustering
+from clustering import apply_clustering, generate_cluster_names
 from visualization import (
    create_visualization_plot, display_clustering_metrics, display_summary_stats,
-    display_clustering_results, display_data_table
+    display_clustering_results, display_data_table, display_cluster_summary
 )


@@ -51,11 +51,34 @@ def main():
    # Get UI parameters
    params = get_all_ui_parameters(valid_df)
    
+    # Check if any sources are selected before proceeding
+    if not params['selected_sources']:
+        st.info("📂 **Select source files from the sidebar to begin visualization**")
+        st.markdown("### Available Data Sources:")
+        
+        # Show available sources as an informational table
+        source_info = []
+        for source in valid_df['source_file'].unique():
+            source_data = valid_df[valid_df['source_file'] == source]
+            source_info.append({
+                'Source File': source,
+                'Messages': len(source_data),
+                'Unique Authors': source_data['author_name'].nunique(),
+                'Date Range': f"{source_data['timestamp_utc'].min()} to {source_data['timestamp_utc'].max()}"
+            })
+        
+        import pandas as pd
+        source_df = pd.DataFrame(source_info)
+        st.dataframe(source_df, use_container_width=True, hide_index=True)
+        
+        st.markdown("👈 **Use the sidebar to select which sources to visualize**")
+        st.stop()
+    
    # Filter data
    filtered_df = filter_data(valid_df, params['selected_sources'], params['selected_authors'])
    
    if filtered_df.empty:
-        st.warning("No data matches the current filters!")
+        st.warning("No data matches the current filters! Try selecting different sources or authors.")
        st.stop()
    
    # Display performance warnings
@@ -95,12 +118,22 @@ def main():
                jitter_strength=params['jitter_strength']
            )
    
+    # Generate cluster names if clustering was applied
+    cluster_names = None
+    if cluster_labels is not None:
+        with st.spinner("Generating cluster names..."):
+            cluster_names = generate_cluster_names(filtered_df, cluster_labels)
+    
    # Display clustering metrics
    display_clustering_metrics(
        cluster_labels, silhouette_avg, calinski_harabasz, 
        params['show_cluster_metrics']
    )
    
+    # Display cluster summary with names
+    if cluster_names:
+        display_cluster_summary(cluster_names, cluster_labels)
+    
    # Create and display the main plot
    fig = create_visualization_plot(
        reduced_embeddings=reduced_embeddings,
@@ -113,7 +146,8 @@ def main():
        point_opacity=params['point_opacity'],
        density_based_sizing=params['density_based_sizing'],
        size_variation=params['size_variation'],
-        enable_3d=params['enable_3d']
+        enable_3d=params['enable_3d'],
+        cluster_names=cluster_names
    )
    
    st.plotly_chart(fig, use_container_width=True)
--- a/apps/cluster_map/ui_components.py
+++ b/apps/cluster_map/ui_components.py
@@ -7,7 +7,8 @@ import numpy as np
 from config import (
    APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
    CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
-    LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
+    LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS,
+    DEFAULT_DIMENSION_REDUCTION_METHOD, DEFAULT_CLUSTERING_METHOD
 )


@@ -38,17 +39,23 @@ def create_method_controls():
    )
    
    # Dimension reduction method
+    method_options = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
+    default_index = method_options.index(DEFAULT_DIMENSION_REDUCTION_METHOD) if DEFAULT_DIMENSION_REDUCTION_METHOD in method_options else 0
    method = st.sidebar.selectbox(
        "Dimension Reduction Method",
-        ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"],
+        method_options,
+        index=default_index,
        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing."
    )
    
    # Clustering method
+    clustering_options = ["None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture", 
+                         "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS"]
+    clustering_default_index = clustering_options.index(DEFAULT_CLUSTERING_METHOD) if DEFAULT_CLUSTERING_METHOD in clustering_options else 0
    clustering_method = st.sidebar.selectbox(
        "Clustering Method",
-        ["None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture", 
-         "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS"],
+        clustering_options,
+        index=clustering_default_index,
        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters."
    )
    
@@ -57,9 +64,25 @@ def create_method_controls():

 def create_clustering_controls(clustering_method):
    """Create controls for clustering parameters"""
-    n_clusters = 5
+    # Always show the clusters slider, but indicate when it's used
+    # Each branch only picks the tooltip text and whether the slider is
+    # interactive; the slider itself is created once, after the branches.
     if clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
-        n_clusters = st.sidebar.slider("Number of Clusters", 2, 15, 5)
+        help_text = "Number of clusters to create. This setting affects the clustering algorithm."
+        disabled = False
+    elif clustering_method == "None":
+        help_text = "Clustering is disabled. This setting has no effect."
+        disabled = True
+    else:
+        help_text = f"{clustering_method} automatically determines the number of clusters. This setting has no effect."
+        disabled = True
+    
+    # The slider's current value is returned even when the control is disabled.
+    n_clusters = st.sidebar.slider(
+        "Number of Clusters", 
+        min_value=2, 
+        max_value=20, 
+        value=5,
+        disabled=disabled,
+        help=help_text
+    )
    
    return n_clusters

--- a/apps/cluster_map/visualization.py
+++ b/apps/cluster_map/visualization.py
@@ -47,7 +47,8 @@ def calculate_point_sizes(reduced_embeddings, density_based_sizing=False,


 def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover_text, 
-                         point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA", enable_3d=False):
+                         point_sizes, point_opacity=DEFAULT_POINT_OPACITY, method="PCA", enable_3d=False,
+                         cluster_names=None):
    """Create a plot colored by clusters"""
    fig = go.Figure()
    
@@ -61,7 +62,11 @@ def create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, hover
            cluster_hover = [hover_text[j] for j, mask in enumerate(cluster_mask) if mask]
            cluster_sizes = [point_sizes[j] for j, mask in enumerate(cluster_mask) if mask]
            
-            cluster_name = f"Cluster {cluster_id}" if cluster_id != -1 else "Noise"
+            # Use generated name if available, otherwise fall back to default
+            if cluster_names and cluster_id in cluster_names:
+                cluster_name = cluster_names[cluster_id]
+            else:
+                cluster_name = f"Cluster {cluster_id}" if cluster_id != -1 else "Noise"
            
            if enable_3d:
                fig.add_trace(go.Scatter3d(
@@ -149,7 +154,8 @@ def create_source_colored_plot(reduced_embeddings, filtered_df, selected_sources
 def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=None, 
                             selected_sources=None, method="PCA", clustering_method="None",
                             point_size=DEFAULT_POINT_SIZE, point_opacity=DEFAULT_POINT_OPACITY,
-                             density_based_sizing=False, size_variation=2.0, enable_3d=False):
+                             density_based_sizing=False, size_variation=2.0, enable_3d=False,
+                             cluster_names=None):
    """Create the main visualization plot"""
    
    # Create hover text
@@ -162,7 +168,8 @@ def create_visualization_plot(reduced_embeddings, filtered_df, cluster_labels=No
    # Create plot based on coloring strategy
    if cluster_labels is not None:
        fig = create_clustered_plot(reduced_embeddings, filtered_df, cluster_labels, 
-                                   hover_text, point_sizes, point_opacity, method, enable_3d)
+                                   hover_text, point_sizes, point_opacity, method, enable_3d,
+                                   cluster_names)
    else:
        if selected_sources is None:
            selected_sources = filtered_df['source_file'].unique()
@@ -276,3 +283,29 @@ def display_data_table(filtered_df, cluster_labels=None):
        
    display_df['content'] = display_df['content'].str[:100] + '...'  # Truncate for display
    st.dataframe(display_df, use_container_width=True)
+
+
+def display_cluster_summary(cluster_names, cluster_labels):
+    """Render a table listing each named cluster with its size and share.
+
+    Args:
+        cluster_names: Mapping from cluster_id to cluster_name.
+        cluster_labels: Array of cluster labels, one per message.
+
+    Nothing is rendered when there are no names or no labels. Rows are
+    ordered from the largest cluster to the smallest.
+    """
+    if not cluster_names or cluster_labels is None:
+        return
+
+    st.subheader("🏷️ Cluster Summary")
+
+    total_messages = len(cluster_labels)
+
+    def _summary_row(cluster_id, name):
+        # Number of messages whose label equals this cluster id
+        size = np.sum(cluster_labels == cluster_id)
+        return {
+            'Cluster ID': cluster_id,
+            'Cluster Name': name,
+            'Message Count': size,
+            'Percentage': f"{100 * size / total_messages:.1f}%"
+        }
+
+    rows = [_summary_row(cluster_id, name) for cluster_id, name in cluster_names.items()]
+
+    # Largest clusters first
+    ranked_rows = sorted(rows, key=lambda row: row['Message Count'], reverse=True)
+
+    st.dataframe(pd.DataFrame(ranked_rows), use_container_width=True, hide_index=True)