clustermap app
This commit is contained in:
58
apps/cluster_map/README.md
Normal file
58
apps/cluster_map/README.md
Normal file
@@ -0,0 +1,58 @@
|
||||
# Discord Chat Embeddings Visualizer
|
||||
|
||||
A Streamlit application that visualizes Discord chat messages using their vector embeddings in 2D space.
|
||||
|
||||
## Features
|
||||
|
||||
- **2D Visualization**: View chat messages plotted using PCA or t-SNE dimension reduction
|
||||
- **Interactive Plotting**: Hover over points to see message content, author, and timestamp
|
||||
- **Filtering**: Filter by source chat log files and authors
|
||||
- **Multiple Datasets**: Automatically loads all CSV files from the discord_chat_logs folder
|
||||
|
||||
## Installation
|
||||
|
||||
1. Install the required dependencies:
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
Run the Streamlit application:
|
||||
|
||||
```bash
|
||||
streamlit run streamlit_app.py
|
||||
```
|
||||
|
||||
The app will automatically load all CSV files from the `../../discord_chat_logs/` directory.
|
||||
|
||||
## Data Format
|
||||
|
||||
The application expects CSV files with the following columns:
|
||||
- `message_id`: Unique identifier for the message
|
||||
- `timestamp_utc`: When the message was sent
|
||||
- `author_id`: Author's Discord ID
|
||||
- `author_name`: Author's username
|
||||
- `author_nickname`: Author's server nickname
|
||||
- `content`: The message content
|
||||
- `attachment_urls`: Any attached files
|
||||
- `embeds`: Embedded content
|
||||
- `content_embedding`: Vector embedding of the message content (as a string representation of a list)
|
||||
|
||||
## Visualization Options
|
||||
|
||||
- **PCA**: Principal Component Analysis - faster, good for getting an overview
|
||||
- **t-SNE**: t-Distributed Stochastic Neighbor Embedding - slower but may reveal better clusters
|
||||
|
||||
## Controls
|
||||
|
||||
- **Dimension Reduction Method**: Choose between PCA and t-SNE
|
||||
- **Filter by Source Files**: Select which chat log files to include
|
||||
- **Filter by Authors**: Select which authors to display
|
||||
- **Show Data Table**: View the underlying data in table format
|
||||
|
||||
## Performance Notes
|
||||
|
||||
- For large datasets, consider filtering by authors or source files to improve performance
|
||||
- t-SNE is computationally intensive and may take longer with large datasets
|
||||
- The app caches data and computations for better performance
|
||||
5
apps/cluster_map/requirements.txt
Normal file
5
apps/cluster_map/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
streamlit>=1.28.0
|
||||
pandas>=1.5.0
|
||||
numpy>=1.24.0
|
||||
plotly>=5.15.0
|
||||
scikit-learn>=1.3.0
|
||||
233
apps/cluster_map/streamlit_app.py
Normal file
233
apps/cluster_map/streamlit_app.py
Normal file
@@ -0,0 +1,233 @@
|
||||
import streamlit as st
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import plotly.express as px
|
||||
import plotly.graph_objects as go
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.manifold import TSNE
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
import ast
|
||||
|
||||
# Set page config
|
||||
st.set_page_config(
|
||||
page_title="Discord Chat Embeddings Visualizer",
|
||||
page_icon="🗨️",
|
||||
layout="wide"
|
||||
)
|
||||
|
||||
# Title and description
|
||||
st.title("🗨️ Discord Chat Embeddings Visualizer")
|
||||
st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
|
||||
|
||||
@st.cache_data
|
||||
def load_all_chat_data():
|
||||
"""Load all CSV files from the discord_chat_logs folder"""
|
||||
chat_logs_path = Path("../../discord_chat_logs")
|
||||
|
||||
# Display the path for debugging
|
||||
st.write(f"Looking for CSV files in: {chat_logs_path}")
|
||||
st.write(f"Path exists: {chat_logs_path.exists()}")
|
||||
|
||||
all_data = []
|
||||
|
||||
for csv_file in chat_logs_path.glob("*.csv"):
|
||||
try:
|
||||
df = pd.read_csv(csv_file)
|
||||
df['source_file'] = csv_file.stem # Add source file name
|
||||
all_data.append(df)
|
||||
st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}")
|
||||
except Exception as e:
|
||||
st.error(f"❌ Error loading {csv_file.name}: {e}")
|
||||
|
||||
if all_data:
|
||||
combined_df = pd.concat(all_data, ignore_index=True)
|
||||
st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files")
|
||||
return combined_df
|
||||
else:
|
||||
st.error("No data loaded!")
|
||||
return pd.DataFrame()
|
||||
|
||||
@st.cache_data
|
||||
def parse_embeddings(df):
|
||||
"""Parse the content_embedding column from string to numpy array"""
|
||||
embeddings = []
|
||||
valid_indices = []
|
||||
|
||||
for idx, embedding_str in enumerate(df['content_embedding']):
|
||||
try:
|
||||
# Parse the string representation of the list
|
||||
embedding = ast.literal_eval(embedding_str)
|
||||
if isinstance(embedding, list) and len(embedding) > 0:
|
||||
embeddings.append(embedding)
|
||||
valid_indices.append(idx)
|
||||
except Exception as e:
|
||||
continue
|
||||
|
||||
embeddings_array = np.array(embeddings)
|
||||
valid_df = df.iloc[valid_indices].copy()
|
||||
|
||||
st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
|
||||
st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")
|
||||
|
||||
return embeddings_array, valid_df
|
||||
|
||||
@st.cache_data
|
||||
def reduce_dimensions(embeddings, method="PCA", n_components=2):
|
||||
"""Reduce embeddings to 2D using PCA or t-SNE"""
|
||||
if method == "PCA":
|
||||
reducer = PCA(n_components=n_components, random_state=42)
|
||||
elif method == "t-SNE":
|
||||
reducer = TSNE(n_components=n_components, random_state=42, perplexity=min(30, len(embeddings)-1))
|
||||
|
||||
reduced_embeddings = reducer.fit_transform(embeddings)
|
||||
return reduced_embeddings
|
||||
|
||||
def create_hover_text(df):
|
||||
"""Create hover text for plotly"""
|
||||
hover_text = []
|
||||
for _, row in df.iterrows():
|
||||
text = f"<b>Author:</b> {row['author_name']}<br>"
|
||||
text += f"<b>Timestamp:</b> {row['timestamp_utc']}<br>"
|
||||
text += f"<b>Source:</b> {row['source_file']}<br>"
|
||||
|
||||
# Handle potential NaN or non-string content
|
||||
content = row['content']
|
||||
if pd.isna(content) or content is None:
|
||||
content_text = "[No content]"
|
||||
else:
|
||||
content_str = str(content)
|
||||
content_text = content_str[:200] + ('...' if len(content_str) > 200 else '')
|
||||
|
||||
text += f"<b>Content:</b> {content_text}"
|
||||
hover_text.append(text)
|
||||
return hover_text
|
||||
|
||||
def main():
|
||||
# Load data
|
||||
with st.spinner("Loading chat data..."):
|
||||
df = load_all_chat_data()
|
||||
|
||||
if df.empty:
|
||||
st.stop()
|
||||
|
||||
# Parse embeddings
|
||||
with st.spinner("Parsing embeddings..."):
|
||||
embeddings, valid_df = parse_embeddings(df)
|
||||
|
||||
if len(embeddings) == 0:
|
||||
st.error("No valid embeddings found!")
|
||||
st.stop()
|
||||
|
||||
# Sidebar controls
|
||||
st.sidebar.header("🎛️ Visualization Controls")
|
||||
|
||||
# Dimension reduction method
|
||||
method = st.sidebar.selectbox(
|
||||
"Dimension Reduction Method",
|
||||
["PCA", "t-SNE"],
|
||||
help="PCA is faster, t-SNE may reveal better clusters"
|
||||
)
|
||||
|
||||
# Source file filter
|
||||
source_files = valid_df['source_file'].unique()
|
||||
selected_sources = st.sidebar.multiselect(
|
||||
"Filter by Source Files",
|
||||
source_files,
|
||||
default=source_files,
|
||||
help="Select which chat log files to include"
|
||||
)
|
||||
|
||||
# Author filter
|
||||
authors = valid_df['author_name'].unique()
|
||||
selected_authors = st.sidebar.multiselect(
|
||||
"Filter by Authors",
|
||||
authors,
|
||||
default=authors[:10] if len(authors) > 10 else authors, # Limit to first 10 for performance
|
||||
help="Select which authors to include"
|
||||
)
|
||||
|
||||
# Filter data
|
||||
filtered_df = valid_df[
|
||||
(valid_df['source_file'].isin(selected_sources)) &
|
||||
(valid_df['author_name'].isin(selected_authors))
|
||||
]
|
||||
|
||||
if filtered_df.empty:
|
||||
st.warning("No data matches the current filters!")
|
||||
st.stop()
|
||||
|
||||
# Get corresponding embeddings
|
||||
filtered_indices = filtered_df.index.tolist()
|
||||
filtered_embeddings = embeddings[[i for i, idx in enumerate(valid_df.index) if idx in filtered_indices]]
|
||||
|
||||
st.info(f"📈 Visualizing {len(filtered_df)} messages")
|
||||
|
||||
# Reduce dimensions
|
||||
with st.spinner(f"Reducing dimensions using {method}..."):
|
||||
reduced_embeddings = reduce_dimensions(filtered_embeddings, method)
|
||||
|
||||
# Create hover text
|
||||
hover_text = create_hover_text(filtered_df)
|
||||
|
||||
# Create the plot
|
||||
fig = go.Figure()
|
||||
|
||||
# Color by source file
|
||||
colors = px.colors.qualitative.Set1
|
||||
for i, source in enumerate(selected_sources):
|
||||
source_mask = filtered_df['source_file'] == source
|
||||
if source_mask.any():
|
||||
source_data = filtered_df[source_mask]
|
||||
source_embeddings = reduced_embeddings[source_mask]
|
||||
source_hover = [hover_text[j] for j, mask in enumerate(source_mask) if mask]
|
||||
|
||||
fig.add_trace(go.Scatter(
|
||||
x=source_embeddings[:, 0],
|
||||
y=source_embeddings[:, 1],
|
||||
mode='markers',
|
||||
name=source,
|
||||
marker=dict(
|
||||
size=8,
|
||||
color=colors[i % len(colors)],
|
||||
opacity=0.7,
|
||||
line=dict(width=1, color='white')
|
||||
),
|
||||
hovertemplate='%{hovertext}<extra></extra>',
|
||||
hovertext=source_hover
|
||||
))
|
||||
|
||||
fig.update_layout(
|
||||
title=f"Discord Chat Messages - {method} Visualization",
|
||||
xaxis_title=f"{method} Component 1",
|
||||
yaxis_title=f"{method} Component 2",
|
||||
hovermode='closest',
|
||||
width=1000,
|
||||
height=700
|
||||
)
|
||||
|
||||
# Display the plot
|
||||
st.plotly_chart(fig, use_container_width=True)
|
||||
|
||||
# Statistics
|
||||
col1, col2, col3 = st.columns(3)
|
||||
|
||||
with col1:
|
||||
st.metric("Total Messages", len(filtered_df))
|
||||
|
||||
with col2:
|
||||
st.metric("Unique Authors", filtered_df['author_name'].nunique())
|
||||
|
||||
with col3:
|
||||
st.metric("Source Files", len(selected_sources))
|
||||
|
||||
# Show data table
|
||||
if st.checkbox("Show Data Table"):
|
||||
st.subheader("📋 Message Data")
|
||||
display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
|
||||
display_df['content'] = display_df['content'].str[:100] + '...' # Truncate for display
|
||||
st.dataframe(display_df, use_container_width=True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user