""" Streamlit UI components and controls for the Discord Chat Embeddings Visualizer. """ import streamlit as st import numpy as np from config import ( APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS, CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS, LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS, DEFAULT_DIMENSION_REDUCTION_METHOD, DEFAULT_CLUSTERING_METHOD ) def setup_page_config(): """Set up the Streamlit page configuration""" st.set_page_config( page_title=APP_TITLE, page_icon=APP_ICON, layout=APP_LAYOUT ) def display_title_and_description(): """Display the main title and description""" st.title(f"{APP_ICON} {APP_TITLE}") st.markdown("Explore Discord chat messages through their vector embeddings in 2D space") def create_method_controls(): """Create controls for dimension reduction and clustering methods""" st.sidebar.header("🎛️ Visualization Controls") # 3D visualization toggle enable_3d = st.sidebar.checkbox( "Enable 3D Visualization", value=False, help="Switch between 2D and 3D visualization. 3D uses 3 components instead of 2." ) # Dimension reduction method method_options = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"] default_index = method_options.index(DEFAULT_DIMENSION_REDUCTION_METHOD) if DEFAULT_DIMENSION_REDUCTION_METHOD in method_options else 0 method = st.sidebar.selectbox( "Dimension Reduction Method", method_options, index=default_index, help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing." ) # Clustering method clustering_options = ["None", "HDBSCAN", "Spectral Clustering", "Gaussian Mixture", "Agglomerative (Ward)", "Agglomerative (Complete)", "OPTICS"] clustering_default_index = clustering_options.index(DEFAULT_CLUSTERING_METHOD) if DEFAULT_CLUSTERING_METHOD in clustering_options else 0 clustering_method = st.sidebar.selectbox( "Clustering Method", clustering_options, index=clustering_default_index, help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters." ) return method, clustering_method, enable_3d def create_clustering_controls(clustering_method): """Create controls for clustering parameters""" # Always show the clusters slider, but indicate when it's used if clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS: help_text = "Number of clusters to create. This setting affects the clustering algorithm." disabled = False elif clustering_method == "None": help_text = "Clustering is disabled. This setting has no effect." disabled = True else: help_text = f"{clustering_method} automatically determines the number of clusters. This setting has no effect." disabled = True n_clusters = st.sidebar.slider( "Number of Clusters", min_value=2, max_value=20, value=5, disabled=disabled, help=help_text ) return n_clusters def create_separation_controls(method): """Create controls for point separation and method-specific parameters""" st.sidebar.subheader("🎯 Point Separation Controls") spread_factor = st.sidebar.slider( "Spread Factor", 0.5, 3.0, 1.0, 0.1, help="Increase to spread apart nearby points. Higher values create more separation." ) # Method-specific parameters perplexity_factor = 1.0 min_dist_factor = 1.0 if method == "t-SNE": perplexity_factor = st.sidebar.slider( "Perplexity Factor", 0.1, 2.0, 1.0, 0.1, help="Affects local vs global structure balance. Lower values focus on local details." ) if method == "UMAP": min_dist_factor = st.sidebar.slider( "Min Distance Factor", 0.1, 2.0, 1.0, 0.1, help="Controls how tightly points are packed. Lower values create tighter clusters." ) return spread_factor, perplexity_factor, min_dist_factor def create_jittering_controls(): """Create controls for jittering options""" apply_jittering = st.sidebar.checkbox( "Apply Smart Jittering", value=False, help="Add intelligent noise to separate overlapping points" ) jitter_strength = 0.1 density_based_jitter = True if apply_jittering: jitter_strength = st.sidebar.slider( "Jitter Strength", 0.01, 0.5, 0.1, 0.01, help="Strength of jittering. Higher values spread points more." ) density_based_jitter = st.sidebar.checkbox( "Density-Based Jittering", value=True, help="Apply stronger jittering in dense regions" ) return apply_jittering, jitter_strength, density_based_jitter def create_advanced_options(): """Create advanced visualization options""" with st.sidebar.expander("⚙️ Advanced Options"): show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True) point_size = st.slider("Point Size", 4, 15, 8) point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7) # Density-based visualization density_based_sizing = st.checkbox( "Density-Based Point Sizing", value=False, help="Make points larger in sparse regions, smaller in dense regions" ) size_variation = 2.0 if density_based_sizing: size_variation = st.slider( "Size Variation Factor", 1.5, 4.0, 2.0, 0.1, help="How much point sizes vary based on local density" ) return show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation def create_filter_controls(valid_df): """Create controls for filtering data by source and author""" # Source file filter source_files = valid_df['source_file'].unique() selected_sources = st.sidebar.multiselect( "Filter by Source Files", source_files, default=[], help="Select which chat log files to include" ) # Author filter authors = valid_df['author_name'].unique() default_authors = authors[:MAX_DISPLAYED_AUTHORS] if len(authors) > MAX_DISPLAYED_AUTHORS else authors selected_authors = st.sidebar.multiselect( "Filter by Authors", authors, default=default_authors, help="Select which authors to include" ) return selected_sources, selected_authors def display_method_explanations(): """Display explanations for different methods""" st.sidebar.markdown("---") with st.sidebar.expander("📚 Method Explanations"): st.markdown("**Dimensionality Reduction:**") for method, explanation in METHOD_EXPLANATIONS["dimension_reduction"].items(): st.markdown(f"- **{method}**: {explanation}") st.markdown("\n**Clustering Methods:**") for method, explanation in METHOD_EXPLANATIONS["clustering"].items(): st.markdown(f"- **{method}**: {explanation}") st.markdown("\n**Separation Techniques:**") for technique, explanation in METHOD_EXPLANATIONS["separation"].items(): st.markdown(f"- **{technique}**: {explanation}") st.markdown("\n**Metrics:**") for metric, explanation in METHOD_EXPLANATIONS["metrics"].items(): st.markdown(f"- **{metric}**: {explanation}") def display_performance_warnings(filtered_df, method, clustering_method): """Display performance warnings for computationally intensive operations""" if len(filtered_df) > LARGE_DATASET_WARNING_THRESHOLD: if method in COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]: st.warning(f"⚠️ {method} with {len(filtered_df)} points may take several minutes to compute.") if clustering_method in COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]: st.warning(f"⚠️ {clustering_method} with {len(filtered_df)} points may be computationally intensive.") def get_all_ui_parameters(valid_df): """Get all UI parameters in a single function call""" # Method selection method, clustering_method, enable_3d = create_method_controls() # Clustering parameters n_clusters = create_clustering_controls(clustering_method) # Separation controls spread_factor, perplexity_factor, min_dist_factor = create_separation_controls(method) # Jittering controls apply_jittering, jitter_strength, density_based_jitter = create_jittering_controls() # Advanced options show_cluster_metrics, point_size, point_opacity, density_based_sizing, size_variation = create_advanced_options() # Filters selected_sources, selected_authors = create_filter_controls(valid_df) # Method explanations display_method_explanations() return { 'method': method, 'clustering_method': clustering_method, 'enable_3d': enable_3d, 'n_clusters': n_clusters, 'spread_factor': spread_factor, 'perplexity_factor': perplexity_factor, 'min_dist_factor': min_dist_factor, 'apply_jittering': apply_jittering, 'jitter_strength': jitter_strength, 'density_based_jitter': density_based_jitter, 'show_cluster_metrics': show_cluster_metrics, 'point_size': point_size, 'point_opacity': point_opacity, 'density_based_sizing': density_based_sizing, 'size_variation': size_variation, 'selected_sources': selected_sources, 'selected_authors': selected_authors }