refactor

2025-08-11 02:37:21 +01:00
parent 6d35b42b27
commit 4ca7e8ab61
10 changed files with 1117 additions and 233 deletions
--- a/apps/cluster_map/ui_components.py
+++ b/apps/cluster_map/ui_components.py
@@ -0,0 +1,236 @@
+"""
+Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
+"""
+
+import streamlit as st
+import numpy as np
+from config import (
+    APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
+    CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
+    LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
+)
+
+
+def setup_page_config():
+    """Set up the Streamlit page configuration"""
+    st.set_page_config(
+        page_title=APP_TITLE,
+        page_icon=APP_ICON,
+        layout=APP_LAYOUT
+    )
+
+
+def display_title_and_description():
+    """Display the main title and description"""
+    st.title(f"{APP_ICON} {APP_TITLE}")
+    st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
+
+
+def create_method_controls():
+    """Create controls for dimension reduction and clustering methods"""
+    st.sidebar.header("🎛️ Visualization Controls")
+    
+    # Dimension reduction method
+    method = st.sidebar.selectbox(
+        "Dimension Reduction Method",
+        ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"],
+        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing."
+    )
+    
+    # Clustering method
+    clustering_method = st.sidebar.selectbox(
+        "Clustering Method",
+        ["None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture", 
+         "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS"],
+        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters."
+    )
+    
+    return method, clustering_method
+
+
+def create_clustering_controls(clustering_method):
+    """Create controls for clustering parameters"""
+    n_clusters = 5
+    if clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
+        n_clusters = st.sidebar.slider("Number of Clusters", 2, 15, 5)
+    
+    return n_clusters
+
+
+def create_separation_controls(method):
+    """Create controls for point separation and method-specific parameters"""
+    st.sidebar.subheader("🎯 Point Separation Controls")
+    
+    spread_factor = st.sidebar.slider(
+        "Spread Factor", 
+        0.5, 3.0, 1.0, 0.1,
+        help="Increase to spread apart nearby points. Higher values create more separation."
+    )
+    
+    # Method-specific parameters
+    perplexity_factor = 1.0
+    min_dist_factor = 1.0
+    
+    if method == "t-SNE":
+        perplexity_factor = st.sidebar.slider(
+            "Perplexity Factor", 
+            0.5, 2.0, 1.0, 0.1,
+            help="Affects local vs global structure balance. Lower values focus on local details."
+        )
+        
+    if method == "UMAP":
+        min_dist_factor = st.sidebar.slider(
+            "Min Distance Factor", 
+            0.1, 2.0, 1.0, 0.1,
+            help="Controls how tightly points are packed. Lower values create tighter clusters."
+        )
+    
+    return spread_factor, perplexity_factor, min_dist_factor
+
+
+def create_jittering_controls():
+    """Create controls for jittering options"""
+    apply_jittering = st.sidebar.checkbox(
+        "Apply Smart Jittering", 
+        value=False,
+        help="Add intelligent noise to separate overlapping points"
+    )
+    
+    jitter_strength = 0.1
+    density_based_jitter = True
+    
+    if apply_jittering:
+        jitter_strength = st.sidebar.slider(
+            "Jitter Strength", 
+            0.01, 0.5, 0.1, 0.01,
+            help="Strength of jittering. Higher values spread points more."
+        )
+        density_based_jitter = st.sidebar.checkbox(
+            "Density-Based Jittering", 
+            value=True,
+            help="Apply stronger jittering in dense regions"
+        )
+    
+    return apply_jittering, jitter_strength, density_based_jitter
+
+
+def create_advanced_options():
+    """Create advanced visualization options"""
+    with st.sidebar.expander("⚙️ Advanced Options"):
+        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
+        point_size = st.slider("Point Size", 4, 15, 8)
+        point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7)
+        
+        # Density-based visualization
+        density_based_sizing = st.checkbox(
+            "Density-Based Point Sizing", 
+            value=False,
+            help="Make points larger in sparse regions, smaller in dense regions"
+        )
+        
+        size_variation = 2.0
+        if density_based_sizing:
+            size_variation = st.slider(
+                "Size Variation Factor", 
+                1.5, 4.0, 2.0, 0.1,
+                help="How much point sizes vary based on local density"
+            )
+    
+    return show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation
+
+
+def create_filter_controls(valid_df):
+    """Create controls for filtering data by source and author"""
+    # Source file filter
+    source_files = valid_df['source_file'].unique()
+    selected_sources = st.sidebar.multiselect(
+        "Filter by Source Files",
+        source_files,
+        default=[],
+        help="Select which chat log files to include"
+    )
+    
+    # Author filter
+    authors = valid_df['author_name'].unique()
+    default_authors = authors[:MAX_DISPLAYED_AUTHORS] if len(authors) > MAX_DISPLAYED_AUTHORS else authors
+    selected_authors = st.sidebar.multiselect(
+        "Filter by Authors",
+        authors,
+        default=default_authors,
+        help="Select which authors to include"
+    )
+    
+    return selected_sources, selected_authors
+
+
+def display_method_explanations():
+    """Display explanations for different methods"""
+    st.sidebar.markdown("---")
+    with st.sidebar.expander("📚 Method Explanations"):
+        st.markdown("**Dimensionality Reduction:**")
+        for method, explanation in METHOD_EXPLANATIONS["dimension_reduction"].items():
+            st.markdown(f"- **{method}**: {explanation}")
+        
+        st.markdown("\n**Clustering Methods:**")
+        for method, explanation in METHOD_EXPLANATIONS["clustering"].items():
+            st.markdown(f"- **{method}**: {explanation}")
+        
+        st.markdown("\n**Separation Techniques:**")
+        for technique, explanation in METHOD_EXPLANATIONS["separation"].items():
+            st.markdown(f"- **{technique}**: {explanation}")
+        
+        st.markdown("\n**Metrics:**")
+        for metric, explanation in METHOD_EXPLANATIONS["metrics"].items():
+            st.markdown(f"- **{metric}**: {explanation}")
+
+
+def display_performance_warnings(filtered_df, method, clustering_method):
+    """Display performance warnings for computationally intensive operations"""
+    if len(filtered_df) > LARGE_DATASET_WARNING_THRESHOLD:
+        if method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]:
+            st.warning(f"⚠️ {method} with {len(filtered_df)} points may take several minutes to compute.")
+        if clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]:
+            st.warning(f"⚠️ {clustering_method} with {len(filtered_df)} points may be computationally intensive.")
+
+
+def get_all_ui_parameters(valid_df):
+    """Get all UI parameters in a single function call"""
+    # Method selection
+    method, clustering_method = create_method_controls()
+    
+    # Clustering parameters
+    n_clusters = create_clustering_controls(clustering_method)
+    
+    # Separation controls
+    spread_factor, perplexity_factor, min_dist_factor = create_separation_controls(method)
+    
+    # Jittering controls
+    apply_jittering, jitter_strength, density_based_jitter = create_jittering_controls()
+    
+    # Advanced options
+    show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation = create_advanced_options()
+    
+    # Filters
+    selected_sources, selected_authors = create_filter_controls(valid_df)
+    
+    # Method explanations
+    display_method_explanations()
+    
+    return {
+        'method': method,
+        'clustering_method': clustering_method,
+        'n_clusters': n_clusters,
+        'spread_factor': spread_factor,
+        'perplexity_factor': perplexity_factor,
+        'min_dist_factor': min_dist_factor,
+        'apply_jittering': apply_jittering,
+        'jitter_strength': jitter_strength,
+        'density_based_jitter': density_based_jitter,
+        'show_cluster_metrics': show_cluster_metrics,
+        'point_size': point_size,
+        'point_opacity': point_opacity,
+        'density_based_sizing': density_based_sizing,
+        'size_variation': size_variation,
+        'selected_sources': selected_sources,
+        'selected_authors': selected_authors
+    }