diff --git a/apps/cluster_map/README.md b/apps/cluster_map/README.md
new file mode 100644
index 0000000..8edadd2
--- /dev/null
+++ b/apps/cluster_map/README.md
@@ -0,0 +1,58 @@
+# Discord Chat Embeddings Visualizer
+
+A Streamlit application that visualizes Discord chat messages using their vector embeddings in 2D space.
+
+## Features
+
+- **2D Visualization**: View chat messages plotted using PCA or t-SNE dimension reduction
+- **Interactive Plotting**: Hover over points to see message content, author, and timestamp
+- **Filtering**: Filter by source chat log files and authors
+- **Multiple Datasets**: Automatically loads all CSV files from the discord_chat_logs folder
+
+## Installation
+
+1. Install the required dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+Run the Streamlit application:
+
+```bash
+streamlit run streamlit_app.py
+```
+
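+The app automatically loads every CSV file from the `../../discord_chat_logs/` directory. This path is resolved relative to the current working directory, so launch the app from inside `apps/cluster_map/`.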
+
+## Data Format
+
+The application expects CSV files with the following columns:
+- `message_id`: Unique identifier for the message
+- `timestamp_utc`: When the message was sent
+- `author_id`: Author's Discord ID
+- `author_name`: Author's username
+- `author_nickname`: Author's server nickname
+- `content`: The message content
+- `attachment_urls`: Any attached files
+- `embeds`: Embedded content
+- `content_embedding`: Vector embedding of the message content (as a string representation of a list)
+
+## Visualization Options
+
+- **PCA**: Principal Component Analysis - faster, good for getting an overview
+- **t-SNE**: t-Distributed Stochastic Neighbor Embedding - slower but may reveal better clusters
+
+## Controls
+
+- **Dimension Reduction Method**: Choose between PCA and t-SNE
+- **Filter by Source Files**: Select which chat log files to include
+- **Filter by Authors**: Select which authors to display (when there are more than 10 authors, only the first 10 are selected by default)
+- **Show Data Table**: View the underlying data in table format
+
+## Performance Notes
+
+- For large datasets, consider filtering by authors or source files to improve performance
+- t-SNE is computationally intensive and may take longer with large datasets
+- The app caches data and computations for better performance
diff --git a/apps/cluster_map/requirements.txt b/apps/cluster_map/requirements.txt
new file mode 100644
index 0000000..0ae2aa3
--- /dev/null
+++ b/apps/cluster_map/requirements.txt
@@ -0,0 +1,5 @@
+streamlit>=1.28.0
+pandas>=1.5.0
+numpy>=1.24.0
+plotly>=5.15.0
+scikit-learn>=1.3.0
diff --git a/apps/cluster_map/streamlit_app.py b/apps/cluster_map/streamlit_app.py
new file mode 100644
index 0000000..7f3d0eb
--- /dev/null
+++ b/apps/cluster_map/streamlit_app.py
@@ -0,0 +1,233 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+import json
+import os
+from pathlib import Path
+import ast
+
+# Browser-tab title, icon and full-width layout. Kept ahead of every other
+# Streamlit call in this script.
+st.set_page_config(
+    page_title="Discord Chat Embeddings Visualizer",
+    page_icon="🗨️",
+    layout="wide",
+)
+
+# Page heading followed by a one-line summary of what the app shows.
+st.title("🗨️ Discord Chat Embeddings Visualizer")
+st.markdown("Explore Discord chat messages through their vector embeddings in 2D space")
+
+@st.cache_data
+def load_all_chat_data():
+    """Load and combine every CSV export found in the chat-log folder.
+
+    The folder is ``../../discord_chat_logs``, resolved against the current
+    working directory. Each file is read with ``pd.read_csv`` and tagged with
+    a ``source_file`` column holding the file name without its extension.
+    Files that fail to load are reported in the UI and skipped.
+
+    Returns:
+        pd.DataFrame: All loaded rows concatenated under a fresh integer
+        index, or an empty DataFrame when nothing could be loaded.
+    """
+    chat_logs_path = Path("../../discord_chat_logs")
+
+    # Display the path for debugging
+    st.write(f"Looking for CSV files in: {chat_logs_path}")
+    st.write(f"Path exists: {chat_logs_path.exists()}")
+
+    all_data = []
+
+    # Path.glob yields entries in a filesystem-dependent order; sorting keeps
+    # the row order (and anything derived from it, such as the order of the
+    # source-file filter options) stable from run to run.
+    for csv_file in sorted(chat_logs_path.glob("*.csv")):
+        try:
+            df = pd.read_csv(csv_file)
+        except Exception as e:
+            # Best effort: one unreadable file must not block the others.
+            st.error(f"❌ Error loading {csv_file.name}: {e}")
+            continue
+        df['source_file'] = csv_file.stem  # Add source file name
+        all_data.append(df)
+        st.write(f"✅ Loaded {len(df)} messages from {csv_file.name}")
+
+    if not all_data:
+        st.error("No data loaded!")
+        return pd.DataFrame()
+
+    combined_df = pd.concat(all_data, ignore_index=True)
+    st.success(f"🎉 Successfully loaded {len(combined_df)} total messages from {len(all_data)} files")
+    return combined_df
+
+@st.cache_data
+def parse_embeddings(df):
+    """Convert the ``content_embedding`` strings into a numeric array.
+
+    Each cell is parsed with ``ast.literal_eval``. Rows whose value cannot be
+    parsed, or does not parse to a non-empty list, are skipped. Rows whose
+    vector length differs from the most common length are skipped as well,
+    because vectors of different sizes cannot be stacked into one 2-D array.
+
+    Args:
+        df: DataFrame with a ``content_embedding`` column.
+
+    Returns:
+        tuple: ``(embeddings_array, valid_df)``. Row ``i`` of the array is the
+        embedding of row ``i`` of ``valid_df``; ``valid_df`` is a copy of the
+        kept rows and retains their original index labels.
+    """
+    from collections import Counter
+
+    parsed = []  # (row position in df, embedding list) for every usable row
+    for position, embedding_str in enumerate(df['content_embedding']):
+        try:
+            # Parse the string representation of the list
+            embedding = ast.literal_eval(embedding_str)
+        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
+            # Missing or malformed cell: skip this row, keep going.
+            continue
+        if isinstance(embedding, list) and len(embedding) > 0:
+            parsed.append((position, embedding))
+
+    if parsed:
+        # Keep only vectors of the dominant dimension so np.array below
+        # always receives a rectangular list of lists.
+        dimension_counts = Counter(len(vector) for _, vector in parsed)
+        expected_dim = dimension_counts.most_common(1)[0][0]
+        kept = [(pos, vector) for pos, vector in parsed if len(vector) == expected_dim]
+    else:
+        kept = []
+
+    mismatched = len(parsed) - len(kept)
+    valid_indices = [pos for pos, _ in kept]
+    embeddings = [vector for _, vector in kept]
+
+    embeddings_array = np.array(embeddings)
+    valid_df = df.iloc[valid_indices].copy()
+
+    if mismatched:
+        st.warning(f"Skipped {mismatched} embeddings whose dimension differs from {expected_dim}")
+    st.info(f"📊 Parsed {len(embeddings)} valid embeddings from {len(df)} messages")
+    st.info(f"🔢 Embedding dimension: {embeddings_array.shape[1] if len(embeddings) > 0 else 0}")
+
+    return embeddings_array, valid_df
+
+@st.cache_data
+def reduce_dimensions(embeddings, method="PCA", n_components=2):
+    """Project embeddings down to ``n_components`` dimensions.
+
+    Args:
+        embeddings: Array-like with one embedding vector per row.
+        method: ``"PCA"`` or ``"t-SNE"``.
+        n_components: Number of output dimensions.
+
+    Returns:
+        Array with one row per input row and ``n_components`` columns.
+
+    Raises:
+        ValueError: If ``method`` is neither ``"PCA"`` nor ``"t-SNE"``.
+    """
+    n_samples = len(embeddings)
+    if n_samples < 2:
+        # With a single point the t-SNE perplexity below would be 0 and PCA
+        # would be asked for more components than samples; both reducers
+        # reject that. There is nothing to spread out, so use the origin.
+        return np.zeros((n_samples, n_components))
+
+    if method == "PCA":
+        reducer = PCA(n_components=n_components, random_state=42)
+    elif method == "t-SNE":
+        # Perplexity has to stay below the number of samples.
+        reducer = TSNE(
+            n_components=n_components,
+            random_state=42,
+            perplexity=min(30, n_samples - 1),
+        )
+    else:
+        raise ValueError(f"Unknown dimension reduction method: {method!r}")
+
+    return reducer.fit_transform(embeddings)
+
+def create_hover_text(df):
+    """Build one Plotly hover label per row of ``df``.
+
+    Each label lists the author, timestamp, source file and message content,
+    separated by ``<br>`` so Plotly renders them on separate lines. Content is
+    cut to 200 characters (with a trailing ``...`` when something was cut);
+    missing content is shown as ``[No content]``.
+
+    Args:
+        df: DataFrame with ``author_name``, ``timestamp_utc``,
+            ``source_file`` and ``content`` columns.
+
+    Returns:
+        list[str]: Hover labels in the same order as the rows of ``df``.
+    """
+    hover_text = []
+    for _, row in df.iterrows():
+        # Handle potential NaN or non-string content (pd.isna also covers None)
+        content = row['content']
+        if pd.isna(content):
+            content_text = "[No content]"
+        else:
+            content_str = str(content)
+            content_text = content_str[:200] + ('...' if len(content_str) > 200 else '')
+
+        hover_text.append("<br>".join([
+            f"Author: {row['author_name']}",
+            f"Timestamp: {row['timestamp_utc']}",
+            f"Source: {row['source_file']}",
+            f"Content: {content_text}",
+        ]))
+    return hover_text
+
+def _truncate_content(value, limit=100):
+    """Return ``value`` as text cut to ``limit`` characters for the data table.
+
+    Missing values are returned unchanged; ``...`` is appended only when the
+    text was actually shortened.
+    """
+    if pd.isna(value):
+        return value
+    text = str(value)
+    return text[:limit] + '...' if len(text) > limit else text
+
+
+def _build_figure(filtered_df, reduced_embeddings, hover_text, selected_sources, method):
+    """Create the 2D scatter plot with one trace (and colour) per source file."""
+    fig = go.Figure()
+
+    # Color by source file
+    colors = px.colors.qualitative.Set1
+    for i, source in enumerate(selected_sources):
+        source_mask = (filtered_df['source_file'] == source).to_numpy()
+        if not source_mask.any():
+            continue
+
+        source_embeddings = reduced_embeddings[source_mask]
+        source_hover = [text for text, keep in zip(hover_text, source_mask) if keep]
+
+        fig.add_trace(go.Scatter(
+            x=source_embeddings[:, 0],
+            y=source_embeddings[:, 1],
+            mode='markers',
+            name=source,
+            marker=dict(
+                size=8,
+                color=colors[i % len(colors)],  # cycle when sources outnumber colours
+                opacity=0.7,
+                line=dict(width=1, color='white')
+            ),
+            hovertemplate='%{hovertext}',
+            hovertext=source_hover
+        ))
+
+    fig.update_layout(
+        title=f"Discord Chat Messages - {method} Visualization",
+        xaxis_title=f"{method} Component 1",
+        yaxis_title=f"{method} Component 2",
+        hovermode='closest',
+        width=1000,
+        height=700
+    )
+    return fig
+
+
+def main():
+    """Run the app: load data, apply sidebar filters, plot, and show stats.
+
+    Stops the script run early (``st.stop``) when no data is loaded, no valid
+    embeddings are found, or the current filters match nothing.
+    """
+    # Load data
+    with st.spinner("Loading chat data..."):
+        df = load_all_chat_data()
+
+    if df.empty:
+        st.stop()
+
+    # Parse embeddings
+    with st.spinner("Parsing embeddings..."):
+        embeddings, valid_df = parse_embeddings(df)
+
+    if len(embeddings) == 0:
+        st.error("No valid embeddings found!")
+        st.stop()
+
+    # Sidebar controls
+    st.sidebar.header("🎛️ Visualization Controls")
+
+    # Dimension reduction method
+    method = st.sidebar.selectbox(
+        "Dimension Reduction Method",
+        ["PCA", "t-SNE"],
+        help="PCA is faster, t-SNE may reveal better clusters"
+    )
+
+    # Source file filter
+    source_files = valid_df['source_file'].unique()
+    selected_sources = st.sidebar.multiselect(
+        "Filter by Source Files",
+        source_files,
+        default=source_files,
+        help="Select which chat log files to include"
+    )
+
+    # Author filter
+    authors = valid_df['author_name'].unique()
+    selected_authors = st.sidebar.multiselect(
+        "Filter by Authors",
+        authors,
+        default=authors[:10] if len(authors) > 10 else authors,  # Limit to first 10 for performance
+        help="Select which authors to include"
+    )
+
+    # Filter data. The rows of `embeddings` line up positionally with the rows
+    # of `valid_df`, so one boolean mask selects both in a single pass.
+    keep_mask = (
+        valid_df['source_file'].isin(selected_sources)
+        & valid_df['author_name'].isin(selected_authors)
+    ).to_numpy()
+    filtered_df = valid_df[keep_mask]
+
+    if filtered_df.empty:
+        st.warning("No data matches the current filters!")
+        st.stop()
+
+    # Get corresponding embeddings
+    filtered_embeddings = embeddings[keep_mask]
+
+    st.info(f"📈 Visualizing {len(filtered_df)} messages")
+
+    # Reduce dimensions
+    with st.spinner(f"Reducing dimensions using {method}..."):
+        reduced_embeddings = reduce_dimensions(filtered_embeddings, method)
+
+    # Create hover text
+    hover_text = create_hover_text(filtered_df)
+
+    # Create and display the plot
+    fig = _build_figure(filtered_df, reduced_embeddings, hover_text, selected_sources, method)
+    st.plotly_chart(fig, use_container_width=True)
+
+    # Statistics
+    col1, col2, col3 = st.columns(3)
+
+    with col1:
+        st.metric("Total Messages", len(filtered_df))
+
+    with col2:
+        st.metric("Unique Authors", filtered_df['author_name'].nunique())
+
+    with col3:
+        st.metric("Source Files", len(selected_sources))
+
+    # Show data table
+    if st.checkbox("Show Data Table"):
+        st.subheader("📋 Message Data")
+        display_df = filtered_df[['timestamp_utc', 'author_name', 'source_file', 'content']].copy()
+        # Truncate for display; the ellipsis marks only content that was cut.
+        display_df['content'] = display_df['content'].map(_truncate_content)
+        st.dataframe(display_df, use_container_width=True)
+
+
+if __name__ == "__main__":
+    main()