This commit is contained in:
2025-08-11 02:37:21 +01:00
parent 6d35b42b27
commit 4ca7e8ab61
10 changed files with 1117 additions and 233 deletions

View File

@@ -0,0 +1,236 @@
"""
Streamlit UI components and controls for the Discord Chat Embeddings Visualizer.
"""
import streamlit as st
import numpy as np
from config import (
APP_TITLE, APP_ICON, APP_LAYOUT, METHOD_EXPLANATIONS,
CLUSTERING_METHODS_REQUIRING_N_CLUSTERS, COMPUTATIONALLY_INTENSIVE_METHODS,
LARGE_DATASET_WARNING_THRESHOLD, MAX_DISPLAYED_AUTHORS
)
def setup_page_config():
    """Apply the app-wide page settings to the Streamlit page.

    Forwards the APP_TITLE, APP_ICON and APP_LAYOUT config constants to
    ``st.set_page_config`` as the page title, page icon and layout.
    """
    page_settings = {
        "page_title": APP_TITLE,
        "page_icon": APP_ICON,
        "layout": APP_LAYOUT,
    }
    st.set_page_config(**page_settings)
def display_title_and_description():
    """Render the page heading (icon followed by title) and a short tagline."""
    heading = f"{APP_ICON} {APP_TITLE}"
    st.title(heading)
    st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
def create_method_controls():
    """Render the sidebar selectors for the reduction and clustering methods.

    Adds a sidebar header followed by two select boxes.

    Returns:
        A ``(method, clustering_method)`` tuple holding the values chosen in
        the "Dimension Reduction Method" and "Clustering Method" select boxes.
    """
    sidebar = st.sidebar
    sidebar.header("🎛️ Visualization Controls")

    reduction_choices = ["PCA", "t-SNE", "UMAP", "Spectral Embedding", "Force-Directed"]
    method = sidebar.selectbox(
        "Dimension Reduction Method",
        reduction_choices,
        help="PCA is fastest, UMAP balances speed and quality, t-SNE and Spectral are slower but may reveal better structures. Force-Directed creates natural spacing.",
    )

    # "None" is an ordinary option string here, not the Python None object.
    clustering_choices = [
        "None",
        "HDBSCAN",
        "Spectral Clustering",
        "Gaussian Mixture",
        "Agglomerative (Ward)",
        "Agglomerative (Complete)",
        "OPTICS",
    ]
    clustering_method = sidebar.selectbox(
        "Clustering Method",
        clustering_choices,
        help="Apply clustering to identify groups. HDBSCAN and OPTICS can find variable density clusters.",
    )

    return method, clustering_method
def create_clustering_controls(clustering_method):
    """Return the cluster count, asking the user only when it is needed.

    Args:
        clustering_method: The clustering method name selected in the UI.

    Returns:
        The "Number of Clusters" slider value (range 2-15, initial 5) when
        ``clustering_method`` is listed in
        CLUSTERING_METHODS_REQUIRING_N_CLUSTERS; otherwise the fixed value 5
        and no slider is rendered.
    """
    if clustering_method not in CLUSTERING_METHODS_REQUIRING_N_CLUSTERS:
        return 5
    return st.sidebar.slider("Number of Clusters", 2, 15, 5)
def create_separation_controls(method):
    """Render the point-separation sliders in the sidebar.

    The "Spread Factor" slider is always shown. A "Perplexity Factor" slider
    is added only for "t-SNE" and a "Min Distance Factor" slider only for
    "UMAP"; a factor whose slider is not shown is reported as 1.0.

    Args:
        method: The dimension reduction method name selected in the UI.

    Returns:
        A ``(spread_factor, perplexity_factor, min_dist_factor)`` tuple.
    """
    sidebar = st.sidebar
    sidebar.subheader("🎯 Point Separation Controls")

    spread_factor = sidebar.slider(
        "Spread Factor",
        min_value=0.5,
        max_value=3.0,
        value=1.0,
        step=0.1,
        help="Increase to spread apart nearby points. Higher values create more separation.",
    )

    # Method-specific tuning: each slider appears only for its own method.
    if method == "t-SNE":
        perplexity_factor = sidebar.slider(
            "Perplexity Factor",
            min_value=0.5,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Affects local vs global structure balance. Lower values focus on local details.",
        )
    else:
        perplexity_factor = 1.0

    if method == "UMAP":
        min_dist_factor = sidebar.slider(
            "Min Distance Factor",
            min_value=0.1,
            max_value=2.0,
            value=1.0,
            step=0.1,
            help="Controls how tightly points are packed. Lower values create tighter clusters.",
        )
    else:
        min_dist_factor = 1.0

    return spread_factor, perplexity_factor, min_dist_factor
def create_jittering_controls():
    """Render the jittering toggle and, when enabled, its tuning widgets.

    Returns:
        A ``(apply_jittering, jitter_strength, density_based_jitter)`` tuple.
        While the "Apply Smart Jittering" checkbox is unticked, the strength
        slider and density checkbox are not rendered and the tuple carries
        the fallback values 0.1 and True.
    """
    sidebar = st.sidebar
    apply_jittering = sidebar.checkbox(
        "Apply Smart Jittering",
        value=False,
        help="Add intelligent noise to separate overlapping points",
    )

    if not apply_jittering:
        # Nothing more to ask; report the fallback strength and density mode.
        return apply_jittering, 0.1, True

    jitter_strength = sidebar.slider(
        "Jitter Strength",
        min_value=0.01,
        max_value=0.5,
        value=0.1,
        step=0.01,
        help="Strength of jittering. Higher values spread points more.",
    )
    density_based_jitter = sidebar.checkbox(
        "Density-Based Jittering",
        value=True,
        help="Apply stronger jittering in dense regions",
    )
    return apply_jittering, jitter_strength, density_based_jitter
def create_advanced_options():
    """Render the "Advanced Options" sidebar expander and collect its values.

    Returns:
        A ``(show_cluster_metrics, point_size, point_opacity,
        density_based_sizing, size_variation)`` tuple. ``size_variation`` is
        2.0 unless "Density-Based Point Sizing" is ticked, in which case it
        is the value of the "Size Variation Factor" slider.
    """
    # NOTE(review): all widgets are assumed to live inside the expander (they
    # are created through ``st`` rather than ``st.sidebar``) - confirm against
    # the original indentation.
    with st.sidebar.expander("⚙️ Advanced Options"):
        show_cluster_metrics = st.checkbox("Show Clustering Metrics", value=True)
        point_size = st.slider("Point Size", 4, 15, 8)
        point_opacity = st.slider("Point Opacity", 0.3, 1.0, 0.7)

        # Density-based visualization
        density_based_sizing = st.checkbox(
            "Density-Based Point Sizing",
            value=False,
            help="Make points larger in sparse regions, smaller in dense regions",
        )
        size_variation = (
            st.slider(
                "Size Variation Factor",
                1.5, 4.0, 2.0, 0.1,
                help="How much point sizes vary based on local density",
            )
            if density_based_sizing
            else 2.0
        )

    return (
        show_cluster_metrics,
        point_size,
        point_opacity,
        density_based_sizing,
        size_variation,
    )
def create_filter_controls(valid_df):
    """Render the source-file and author multiselect filters in the sidebar.

    Args:
        valid_df: Table-like object indexed by the 'source_file' and
            'author_name' columns, each exposing ``.unique()`` (presumably a
            pandas DataFrame - confirm against the caller).

    Returns:
        A ``(selected_sources, selected_authors)`` tuple of the current
        multiselect values. The source filter starts empty; the author filter
        starts with at most the first MAX_DISPLAYED_AUTHORS authors selected.
    """
    sidebar = st.sidebar

    # Source file filter
    source_options = valid_df['source_file'].unique()
    selected_sources = sidebar.multiselect(
        "Filter by Source Files",
        source_options,
        default=[],
        help="Select which chat log files to include",
    )

    # Author filter: cap the pre-selected authors at MAX_DISPLAYED_AUTHORS.
    author_options = valid_df['author_name'].unique()
    if len(author_options) > MAX_DISPLAYED_AUTHORS:
        preselected_authors = author_options[:MAX_DISPLAYED_AUTHORS]
    else:
        preselected_authors = author_options
    selected_authors = sidebar.multiselect(
        "Filter by Authors",
        author_options,
        default=preselected_authors,
        help="Select which authors to include",
    )

    return selected_sources, selected_authors
def display_method_explanations():
    """Render a sidebar expander listing the METHOD_EXPLANATIONS entries.

    After a horizontal rule, each of the four sections (dimension reduction,
    clustering, separation, metrics) is written as a bold heading followed by
    one bullet per ``name: explanation`` pair from the matching
    METHOD_EXPLANATIONS sub-dictionary.
    """
    # (heading markdown, METHOD_EXPLANATIONS key) in display order.
    sections = (
        ("**Dimensionality Reduction:**", "dimension_reduction"),
        ("\n**Clustering Methods:**", "clustering"),
        ("\n**Separation Techniques:**", "separation"),
        ("\n**Metrics:**", "metrics"),
    )

    st.sidebar.markdown("---")
    with st.sidebar.expander("📚 Method Explanations"):
        for heading, section_key in sections:
            st.markdown(heading)
            for name, explanation in METHOD_EXPLANATIONS[section_key].items():
                st.markdown(f"- **{name}**: {explanation}")
def display_performance_warnings(filtered_df, method, clustering_method):
    """Warn when a costly method is about to run on a large dataset.

    Nothing is shown unless ``len(filtered_df)`` exceeds
    LARGE_DATASET_WARNING_THRESHOLD. Above that size, one warning is emitted
    for each selected method that is listed in the matching section of
    COMPUTATIONALLY_INTENSIVE_METHODS.

    Args:
        filtered_df: Sized collection holding the points to be visualized.
        method: The dimension reduction method name selected in the UI.
        clustering_method: The clustering method name selected in the UI.
    """
    point_count = len(filtered_df)
    if point_count <= LARGE_DATASET_WARNING_THRESHOLD:
        return

    slow_reducers = COMPUTATIONALLY_INTENSIVE_METHODS["dimension_reduction"]
    if method in slow_reducers:
        st.warning(f"⚠️ {method} with {point_count} points may take several minutes to compute.")

    slow_clusterers = COMPUTATIONALLY_INTENSIVE_METHODS["clustering"]
    if clustering_method in slow_clusterers:
        st.warning(f"⚠️ {clustering_method} with {point_count} points may be computationally intensive.")
def get_all_ui_parameters(valid_df):
    """Render every sidebar control group and gather the values in one dict.

    The groups are created in a fixed order: method selection, clustering
    parameters, separation, jittering, advanced options, filters and finally
    the method explanations panel.

    Args:
        valid_df: Data passed through to ``create_filter_controls`` to
            populate the source-file and author filters.

    Returns:
        A dict keyed by parameter name ('method', 'clustering_method',
        'n_clusters', 'spread_factor', 'perplexity_factor',
        'min_dist_factor', 'apply_jittering', 'jitter_strength',
        'density_based_jitter', 'show_cluster_metrics', 'point_size',
        'point_opacity', 'density_based_sizing', 'size_variation',
        'selected_sources', 'selected_authors').
    """
    # Method selection
    method, clustering_method = create_method_controls()
    params = {'method': method, 'clustering_method': clustering_method}

    # Clustering parameters
    params['n_clusters'] = create_clustering_controls(clustering_method)

    # Separation controls
    spread, perplexity, min_dist = create_separation_controls(method)
    params.update(
        spread_factor=spread,
        perplexity_factor=perplexity,
        min_dist_factor=min_dist,
    )

    # Jittering controls
    jitter_on, jitter_amount, jitter_by_density = create_jittering_controls()
    params.update(
        apply_jittering=jitter_on,
        jitter_strength=jitter_amount,
        density_based_jitter=jitter_by_density,
    )

    # Advanced options
    show_metrics, size, opacity, size_by_density, variation = create_advanced_options()
    params.update(
        show_cluster_metrics=show_metrics,
        point_size=size,
        point_opacity=opacity,
        density_based_sizing=size_by_density,
        size_variation=variation,
    )

    # Filters
    sources, authors = create_filter_controls(valid_df)
    params.update(selected_sources=sources, selected_authors=authors)

    # Method explanations
    display_method_explanations()

    return params