# cult-scraper/apps/cluster_map/ui_components.py

"""
Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
"""

import streamlit as st
import numpy as np
from config import (
    APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
    CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
    LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS,
    DEFAULT_DIMENSION_REDUCTION_METHOD, DEFAULT_CLUSTERING_METHOD
)


def setup_page_config():
    """Apply the configured title, icon and layout to the Streamlit page."""
    # Collect the settings first so the call site stays a single line.
    page_settings = {
        "page_title": APP_TITLE,
        "page_icon": APP_ICON,
        "layout": APP_LAYOUT,
    }
    st.set_page_config(**page_settings)


def display_title_and_description():
    """Render the page heading (icon plus title) and the one-line tagline."""
    heading = f"{APP_ICON} {APP_TITLE}"
    tagline = "Explore Discord chat messages through their vector embeddings in 2D space"

    st.title(heading)
    st.markdown(tagline)


def create_method_controls():
    """Build the sidebar widgets that choose the projection and clustering methods.

    Returns:
        A ``(method, clustering_method, enable_3d)`` tuple: the selected
        dimension reduction method, the selected clustering method, and the
        state of the 3D checkbox.
    """
    sidebar = st.sidebar
    sidebar.header("🎛️ Visualization Controls")

    # Checkbox switching the plot between two and three components.
    enable_3d = sidebar.checkbox(
        "Enable 3D Visualization",
        value=False,
        help="Switch between 2D and 3D visualization. 3D uses 3 components instead of 2.",
    )

    # Projection method; preselect the configured default when it is offered,
    # otherwise fall back to the first entry.
    reduction_choices = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
    if DEFAULT_DIMENSION_REDUCTION_METHOD in reduction_choices:
        reduction_start = reduction_choices.index(DEFAULT_DIMENSION_REDUCTION_METHOD)
    else:
        reduction_start = 0
    method = sidebar.selectbox(
        "Dimension Reduction Method",
        reduction_choices,
        index=reduction_start,
        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing.",
    )

    # Clustering method, with the same default-or-first-entry rule.
    cluster_choices = [
        "None",
        "HDBSCAN",
        "Spectral Clustering",
        "Gaussian Mixture",
        "Agglomerative (Ward)",
        "Agglomerative (Complete)",
        "OPTICS",
    ]
    if DEFAULT_CLUSTERING_METHOD in cluster_choices:
        cluster_start = cluster_choices.index(DEFAULT_CLUSTERING_METHOD)
    else:
        cluster_start = 0
    clustering_method = sidebar.selectbox(
        "Clustering Method",
        cluster_choices,
        index=cluster_start,
        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters.",
    )

    return method, clustering_method, enable_3d


def create_clustering_controls(clustering_method):
    """Show the "Number of Clusters" slider and return its value.

    The slider is always rendered. It is only enabled when
    ``clustering_method`` is listed in
    ``CLUSTERING_METHODS_REQUIRING_N_CLUSTERS``; otherwise it is disabled and
    its help text explains why the setting has no effect.

    Args:
        clustering_method: Name of the clustering method selected in the UI.

    Returns:
        The slider value (between 2 and 20, default 5).
    """
    uses_cluster_count = clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS

    if uses_cluster_count:
        slider_help = "Number of clusters to create. This setting affects the clustering algorithm."
    elif clustering_method == "None":
        slider_help = "Clustering is disabled. This setting has no effect."
    else:
        slider_help = f"{clustering_method} automatically determines the number of clusters. This setting has no effect."

    return st.sidebar.slider(
        "Number of Clusters",
        min_value=2,
        max_value=20,
        value=5,
        disabled=not uses_cluster_count,
        help=slider_help,
    )


def create_separation_controls(method):
    """Build the point-separation sliders, including method-specific ones.

    Args:
        method: Name of the selected dimension reduction method. "t-SNE" adds
            a perplexity slider and "UMAP" adds a minimum-distance slider.

    Returns:
        A ``(spread_factor, perplexity_factor, min_dist_factor)`` tuple. The
        two method-specific factors stay at 1.0 when their slider is not shown.
    """
    sidebar = st.sidebar
    sidebar.subheader("🎯 Point Separation Controls")

    spread_factor = sidebar.slider(
        "Spread Factor",
        min_value=0.5,
        max_value=3.0,
        value=1.0,
        step=0.1,
        help="Increase to spread apart nearby points. Higher values create more separation.",
    )

    # Defaults used whenever the matching method-specific slider is hidden.
    perplexity_factor = 1.0
    min_dist_factor = 1.0

    if method == "t-SNE":
        perplexity_factor = sidebar.slider(
            "Perplexity Factor",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Affects local vs global structure balance. Lower values focus on local details.",
        )
    elif method == "UMAP":
        min_dist_factor = sidebar.slider(
            "Min Distance Factor",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls how tightly points are packed. Lower values create tighter clusters.",
        )

    return spread_factor, perplexity_factor, min_dist_factor


def create_jittering_controls():
    """Build the jittering checkbox and, when it is ticked, its detail widgets.

    Returns:
        An ``(apply_jittering, jitter_strength, density_based_jitter)`` tuple.
        When jittering is off the strength is 0.1 and the density flag is True.
    """
    sidebar = st.sidebar

    apply_jittering = sidebar.checkbox(
        "Apply Smart Jittering",
        value=False,
        help="Add intelligent noise to separate overlapping points",
    )

    # The detail widgets are only rendered when jittering is switched on.
    if not apply_jittering:
        return apply_jittering, 0.1, True

    jitter_strength = sidebar.slider(
        "Jitter Strength",
        min_value=0.01,
        max_value=0.5,
        value=0.1,
        step=0.01,
        help="Strength of jittering. Higher values spread points more.",
    )
    density_based_jitter = sidebar.checkbox(
        "Density-Based Jittering",
        value=True,
        help="Apply stronger jittering in dense regions",
    )
    return apply_jittering, jitter_strength, density_based_jitter


def create_advanced_options():
    """Build the widgets inside the sidebar's "Advanced Options" expander.

    Returns:
        A ``(show_cluster_metrics, point_size, point_opacity,
        density_based_sizing, size_variation)`` tuple. ``size_variation`` is
        2.0 unless density-based sizing is enabled and its slider is moved.
    """
    with st.sidebar.expander("⚙️ Advanced Options"):
        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
        point_size = st.slider("Point Size", min_value=4, max_value=15, value=8)
        point_opacity = st.slider("Point Opacity", min_value=0.3, max_value=1.0, value=0.7)

        # Optional density-driven sizing of the plotted points.
        density_based_sizing = st.checkbox(
            "Density-Based Point Sizing",
            value=False,
            help="Make points larger in sparse regions, smaller in dense regions",
        )

        # The variation slider only appears while density-based sizing is on.
        if density_based_sizing:
            size_variation = st.slider(
                "Size Variation Factor",
                min_value=1.5,
                max_value=4.0,
                value=2.0,
                step=0.1,
                help="How much point sizes vary based on local density",
            )
        else:
            size_variation = 2.0

    return (
        show_cluster_metrics,
        point_size,
        point_opacity,
        density_based_sizing,
        size_variation,
    )


def create_filter_controls(valid_df):
    """Build the sidebar multiselects for filtering by source file and author.

    Args:
        valid_df: Table-like object with ``'source_file'`` and
            ``'author_name'`` columns whose values support ``.unique()``
            (presumably a pandas DataFrame — confirm against the caller).

    Returns:
        A ``(selected_sources, selected_authors)`` tuple with the current
        selections of the two multiselect widgets.
    """
    sidebar = st.sidebar

    # Source files: every distinct file is offered, none preselected.
    available_sources = valid_df['source_file'].unique()
    selected_sources = sidebar.multiselect(
        "Filter by Source Files",
        available_sources,
        default=[],
        help="Select which chat log files to include",
    )

    # Authors: every distinct author is offered, but at most
    # MAX_DISPLAYED_AUTHORS of them are preselected.
    available_authors = valid_df['author_name'].unique()
    if len(available_authors) > MAX_DISPLAYED_AUTHORS:
        preselected_authors = available_authors[:MAX_DISPLAYED_AUTHORS]
    else:
        preselected_authors = available_authors
    selected_authors = sidebar.multiselect(
        "Filter by Authors",
        available_authors,
        default=preselected_authors,
        help="Select which authors to include",
    )

    return selected_sources, selected_authors


def display_method_explanations():
    """Render the "Method Explanations" expander at the bottom of the sidebar.

    Each section prints a bold heading followed by one bullet per entry of the
    matching ``METHOD_EXPLANATIONS`` sub-dictionary.
    """
    # (heading markdown, METHOD_EXPLANATIONS key) in display order.
    sections = (
        ("**Dimensionality Reduction:**", "dimension_reduction"),
        ("\n**Clustering Methods:**", "clustering"),
        ("\n**Separation Techniques:**", "separation"),
        ("\n**Metrics:**", "metrics"),
    )

    st.sidebar.markdown("---")
    with st.sidebar.expander("📚 Method Explanations"):
        for heading, section_key in sections:
            st.markdown(heading)
            for name, explanation in METHOD_EXPLANATIONS[section_key].items():
                st.markdown(f"- **{name}**: {explanation}")


def display_performance_warnings(filtered_df, method, clustering_method):
    """Warn when a slow method is selected for a large dataset.

    Nothing is shown unless ``len(filtered_df)`` exceeds
    ``LARGE_DATASET_WARNING_THRESHOLD``.

    Args:
        filtered_df: Sized collection of the points about to be processed.
        method: Selected dimension reduction method name.
        clustering_method: Selected clustering method name.
    """
    point_count = len(filtered_df)
    if point_count <= LARGE_DATASET_WARNING_THRESHOLD:
        return

    slow_methods = COMPUTATIONALLY_INTENSIVE_METHODS
    if method in slow_methods["dimension_reduction"]:
        st.warning(f"⚠️ {method} with {point_count} points may take several minutes to compute.")
    if clustering_method in slow_methods["clustering"]:
        st.warning(f"⚠️ {clustering_method} with {point_count} points may be computationally intensive.")


def get_all_ui_parameters(valid_df):
    """Render every sidebar control group and collect the chosen values.

    The groups are rendered in this order: method selection, clustering
    parameters, separation controls, jittering controls, advanced options,
    data filters and finally the method explanations.

    Args:
        valid_df: Data passed through to ``create_filter_controls``.

    Returns:
        A dict mapping each parameter name to the value picked in the UI.
    """
    params = {}

    # Each group's return tuple is unpacked straight into the result dict, so
    # keys are inserted in the same order the widgets are rendered.
    (
        params['method'],
        params['clustering_method'],
        params['enable_3d'],
    ) = create_method_controls()

    params['n_clusters'] = create_clustering_controls(params['clustering_method'])

    (
        params['spread_factor'],
        params['perplexity_factor'],
        params['min_dist_factor'],
    ) = create_separation_controls(params['method'])

    (
        params['apply_jittering'],
        params['jitter_strength'],
        params['density_based_jitter'],
    ) = create_jittering_controls()

    (
        params['show_cluster_metrics'],
        params['point_size'],
        params['point_opacity'],
        params['density_based_sizing'],
        params['size_variation'],
    ) = create_advanced_options()

    (
        params['selected_sources'],
        params['selected_authors'],
    ) = create_filter_controls(valid_df)

    # Purely informational; contributes nothing to the returned parameters.
    display_method_explanations()

    return params