268 lines
9.8 KiB
Python
268 lines
9.8 KiB
Python
"""
|
|
Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
|
|
"""
|
|
|
|
import streamlit as st
|
|
import numpy as np
|
|
from config import (
|
|
APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
|
|
CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
|
|
LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS,
|
|
DEFAULT_DIMENSION_REDUCTION_METHOD, DEFAULT_CLUSTERING_METHOD
|
|
)
|
|
|
|
|
|
def setup_page_config():
    """Configure the Streamlit page from the app-level settings.

    Forwards ``APP_TITLE``, ``APP_ICON`` and ``APP_LAYOUT`` (imported from
    ``config``) to ``st.set_page_config``.
    """
    page_settings = {
        "page_title": APP_TITLE,
        "page_icon": APP_ICON,
        "layout": APP_LAYOUT,
    }
    st.set_page_config(**page_settings)
|
|
|
|
|
|
def display_title_and_description():
    """Render the page heading (icon + title) and the tagline below it."""
    heading = f"{APP_ICON} {APP_TITLE}"
    tagline = "Explore Discord chat messages through their vector embeddings in 2D space"

    st.title(heading)
    st.markdown(tagline)
|
|
|
|
|
|
def create_method_controls():
    """Build the sidebar widgets that pick the visualization methods.

    Renders, in order: a section header, the 3D toggle, the dimension
    reduction selector and the clustering selector.

    Returns:
        A ``(method, clustering_method, enable_3d)`` tuple holding the
        selected dimension reduction method, the selected clustering method
        and the state of the 3D checkbox.
    """

    def _index_or_first(choices, preferred):
        # Fall back to the first entry when the configured default is not
        # one of the offered choices.
        try:
            return choices.index(preferred)
        except ValueError:
            return 0

    sidebar = st.sidebar
    sidebar.header("🎛️ Visualization Controls")

    # Toggle between a 2-component and a 3-component projection.
    enable_3d = sidebar.checkbox(
        "Enable 3D Visualization",
        value=False,
        help="Switch between 2D and 3D visualization. 3D uses 3 components instead of 2.",
    )

    # Dimension reduction selector.
    reduction_choices = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
    reduction_default = _index_or_first(reduction_choices, DEFAULT_DIMENSION_REDUCTION_METHOD)
    method = sidebar.selectbox(
        "Dimension Reduction Method",
        reduction_choices,
        index=reduction_default,
        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing.",
    )

    # Clustering selector.
    cluster_choices = [
        "None",
        "HDBSCAN",
        "Spectral Clustering",
        "Gaussian Mixture",
        "Agglomerative (Ward)",
        "Agglomerative (Complete)",
        "OPTICS",
    ]
    cluster_default = _index_or_first(cluster_choices, DEFAULT_CLUSTERING_METHOD)
    clustering_method = sidebar.selectbox(
        "Clustering Method",
        cluster_choices,
        index=cluster_default,
        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters.",
    )

    return method, clustering_method, enable_3d
|
|
|
|
|
|
def create_clustering_controls(clustering_method):
    """Render the "Number of Clusters" slider for the chosen clustering method.

    The slider is always shown. It is enabled only when ``clustering_method``
    is listed in ``CLUSTERING_METHODS_REQUIRING_N_CLUSTERS``; otherwise it is
    disabled and its help text explains why the value is ignored.

    Args:
        clustering_method: Name of the selected clustering method
            (``"None"`` means clustering is switched off).

    Returns:
        The slider value (range 2-20, default 5).
    """
    uses_cluster_count = clustering_method in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS

    if uses_cluster_count:
        slider_help = "Number of clusters to create. This setting affects the clustering algorithm."
    elif clustering_method == "None":
        slider_help = "Clustering is disabled. This setting has no effect."
    else:
        slider_help = f"{clustering_method} automatically determines the number of clusters. This setting has no effect."

    return st.sidebar.slider(
        "Number of Clusters",
        min_value=2,
        max_value=20,
        value=5,
        disabled=not uses_cluster_count,
        help=slider_help,
    )
|
|
|
|
|
|
def create_separation_controls(method):
    """Render the point-separation sliders in the sidebar.

    A "Spread Factor" slider is always shown. A "Perplexity Factor" slider is
    added when ``method`` is ``"t-SNE"`` and a "Min Distance Factor" slider
    when it is ``"UMAP"``; for every other method both factors stay at 1.0.

    Args:
        method: Name of the selected dimension reduction method.

    Returns:
        A ``(spread_factor, perplexity_factor, min_dist_factor)`` tuple.
    """
    sidebar = st.sidebar
    sidebar.subheader("🎯 Point Separation Controls")

    spread_factor = sidebar.slider(
        "Spread Factor",
        min_value=0.5,
        max_value=3.0,
        value=1.0,
        step=0.1,
        help="Increase to spread apart nearby points. Higher values create more separation.",
    )

    # Neutral multipliers unless the chosen method exposes its own slider.
    perplexity_factor, min_dist_factor = 1.0, 1.0

    if method == "t-SNE":
        perplexity_factor = sidebar.slider(
            "Perplexity Factor",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Affects local vs global structure balance. Lower values focus on local details.",
        )
    elif method == "UMAP":
        min_dist_factor = sidebar.slider(
            "Min Distance Factor",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls how tightly points are packed. Lower values create tighter clusters.",
        )

    return spread_factor, perplexity_factor, min_dist_factor
|
|
|
|
|
|
def create_jittering_controls():
    """Render the jittering options in the sidebar.

    The strength slider and the density checkbox are only shown once
    "Apply Smart Jittering" is ticked; until then their defaults
    (0.1 and ``True``) are returned.

    Returns:
        An ``(apply_jittering, jitter_strength, density_based_jitter)`` tuple.
    """
    sidebar = st.sidebar

    apply_jittering = sidebar.checkbox(
        "Apply Smart Jittering",
        value=False,
        help="Add intelligent noise to separate overlapping points",
    )

    if not apply_jittering:
        # Jittering is off: hand back the defaults without drawing the
        # dependent widgets.
        return apply_jittering, 0.1, True

    jitter_strength = sidebar.slider(
        "Jitter Strength",
        min_value=0.01,
        max_value=0.5,
        value=0.1,
        step=0.01,
        help="Strength of jittering. Higher values spread points more.",
    )
    density_based_jitter = sidebar.checkbox(
        "Density-Based Jittering",
        value=True,
        help="Apply stronger jittering in dense regions",
    )

    return apply_jittering, jitter_strength, density_based_jitter
|
|
|
|
|
|
def create_advanced_options():
    """Render the collapsible "Advanced Options" section of the sidebar.

    The "Size Variation Factor" slider only appears while density-based
    sizing is ticked; otherwise ``size_variation`` is fixed at 2.0.

    Returns:
        A ``(show_cluster_metrics, point_size, point_opacity,
        density_based_sizing, size_variation)`` tuple.
    """
    with st.sidebar.expander("⚙️ Advanced Options"):
        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
        point_size = st.slider("Point Size", min_value=4, max_value=15, value=8)
        point_opacity = st.slider("Point Opacity", min_value=0.3, max_value=1.0, value=0.7)

        # Density-driven marker sizing and its optional tuning slider.
        density_based_sizing = st.checkbox(
            "Density-Based Point Sizing",
            value=False,
            help="Make points larger in sparse regions, smaller in dense regions",
        )
        if density_based_sizing:
            size_variation = st.slider(
                "Size Variation Factor",
                min_value=1.5,
                max_value=4.0,
                value=2.0,
                step=0.1,
                help="How much point sizes vary based on local density",
            )
        else:
            size_variation = 2.0

    return (
        show_cluster_metrics,
        point_size,
        point_opacity,
        density_based_sizing,
        size_variation,
    )
|
|
|
|
|
|
def create_filter_controls(valid_df):
    """Render the source-file and author multiselect filters in the sidebar.

    The source filter starts with nothing selected. The author filter is
    pre-populated with at most ``MAX_DISPLAYED_AUTHORS`` authors, taken in
    the order returned by ``unique()``.

    Args:
        valid_df: Table with ``'source_file'`` and ``'author_name'`` columns
            that support ``.unique()`` (presumably a pandas DataFrame —
            confirm against the caller).

    Returns:
        A ``(selected_sources, selected_authors)`` tuple with the current
        multiselect values.
    """
    sidebar = st.sidebar

    # Source file filter.
    available_sources = valid_df['source_file'].unique()
    selected_sources = sidebar.multiselect(
        "Filter by Source Files",
        available_sources,
        default=[],
        help="Select which chat log files to include",
    )

    # Author filter: cap the pre-selected authors at the configured maximum.
    authors = valid_df['author_name'].unique()
    if len(authors) > MAX_DISPLAYED_AUTHORS:
        preselected_authors = authors[:MAX_DISPLAYED_AUTHORS]
    else:
        preselected_authors = authors
    selected_authors = sidebar.multiselect(
        "Filter by Authors",
        authors,
        default=preselected_authors,
        help="Select which authors to include",
    )

    return selected_sources, selected_authors
|
|
|
|
|
|
def display_method_explanations():
    """Render the "Method Explanations" expander at the bottom of the sidebar.

    Draws a horizontal rule, then lists every entry of the four
    ``METHOD_EXPLANATIONS`` categories as a markdown bullet under its heading.
    """
    # (heading markdown, METHOD_EXPLANATIONS key) pairs, in display order.
    sections = (
        ("**Dimensionality Reduction:**", "dimension_reduction"),
        ("\n**Clustering Methods:**", "clustering"),
        ("\n**Separation Techniques:**", "separation"),
        ("\n**Metrics:**", "metrics"),
    )

    st.sidebar.markdown("---")
    with st.sidebar.expander("📚 Method Explanations"):
        for heading, category in sections:
            st.markdown(heading)
            for name, explanation in METHOD_EXPLANATIONS[category].items():
                st.markdown(f"- **{name}**: {explanation}")
|
|
|
|
|
|
def display_performance_warnings(filtered_df, method, clustering_method):
    """Warn when a slow method is about to run on a large dataset.

    Nothing is shown unless ``filtered_df`` has more than
    ``LARGE_DATASET_WARNING_THRESHOLD`` rows. Above that, one warning is
    emitted for each of ``method`` / ``clustering_method`` that is listed in
    ``COMPUTATIONALLY_INTENSIVE_METHODS``.

    Args:
        filtered_df: The data about to be processed; only ``len()`` is used.
        method: Name of the selected dimension reduction method.
        clustering_method: Name of the selected clustering method.
    """
    if len(filtered_df) <= LARGE_DATASET_WARNING_THRESHOLD:
        return

    slow_reducers = COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]
    if method in slow_reducers:
        st.warning(f"⚠️ {method} with {len(filtered_df)} points may take several minutes to compute.")

    slow_clusterers = COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]
    if clustering_method in slow_clusterers:
        st.warning(f"⚠️ {clustering_method} with {len(filtered_df)} points may be computationally intensive.")
|
|
|
|
|
|
def get_all_ui_parameters(valid_df):
    """Draw every sidebar control and collect the chosen values.

    The helpers are called in the order the widgets should appear in the
    sidebar: method selection, clustering parameters, separation controls,
    jittering controls, advanced options, data filters and finally the
    method explanations.

    Args:
        valid_df: Table passed through to ``create_filter_controls``.

    Returns:
        A dict mapping each parameter name (``'method'``,
        ``'clustering_method'``, ``'enable_3d'``, ``'n_clusters'``, ...,
        ``'selected_authors'``) to the value read from its widget.
    """
    params = {}

    # Method selection
    (
        params['method'],
        params['clustering_method'],
        params['enable_3d'],
    ) = create_method_controls()

    # Clustering parameters
    params['n_clusters'] = create_clustering_controls(params['clustering_method'])

    # Separation controls
    (
        params['spread_factor'],
        params['perplexity_factor'],
        params['min_dist_factor'],
    ) = create_separation_controls(params['method'])

    # Jittering controls
    (
        params['apply_jittering'],
        params['jitter_strength'],
        params['density_based_jitter'],
    ) = create_jittering_controls()

    # Advanced options
    (
        params['show_cluster_metrics'],
        params['point_size'],
        params['point_opacity'],
        params['density_based_sizing'],
        params['size_variation'],
    ) = create_advanced_options()

    # Filters
    (
        params['selected_sources'],
        params['selected_authors'],
    ) = create_filter_controls(valid_df)

    # Method explanations (rendered last; contributes no parameters)
    display_method_explanations()

    return params
|