# Timeline CSV Viewer — Streamlit app (167 lines, 6.5 KiB, Python)
"""Streamlit viewer for Google-Timeline-style CSV exports.

Scans ``../timeline_csv/*/`` for CSV files, lets the user pick one, and —
for datasets with known coordinate columns — renders a folium map plus
coordinate statistics. Every file also gets a tabular preview, column
schema, and numeric summary.
"""

import streamlit as st
import pandas as pd
import os
from pathlib import Path
import folium
from streamlit_folium import folium_static
import numpy as np

st.set_page_config(page_title="Timeline CSV Viewer", layout="wide")

st.title("Timeline CSV Viewer")

# Path to the timeline_csv folder, relative to the app's working directory.
timeline_csv_path = Path("../timeline_csv")

# Collect every CSV from the immediate subdirectories of timeline_csv.
csv_files = []
if timeline_csv_path.exists():
    for subdir in timeline_csv_path.iterdir():
        if subdir.is_dir():
            csv_files.extend(subdir.glob("*.csv"))

if not csv_files:
    st.error("No CSV files found in the timeline_csv folder.")
    st.stop()

# Known geospatial datasets and the columns holding their coordinates/time.
# A value of None means the dataset lacks that kind of column.
GEOSPATIAL_FILES = {
    'timeline_path_points.csv': {'lat': 'lat', 'lon': 'lon', 'time': 'time'},
    'visits.csv': {'lat': 'top_lat', 'lon': 'top_lon', 'time': 'startTime'},
    'raw_signals.csv': {'lat': 'lat', 'lon': 'lon', 'time': 'timestamp'},
    'frequent_places.csv': {'lat': 'lat', 'lon': 'lon', 'time': None},
    'semantic_segments.csv': {'lat': None, 'lon': None, 'time': 'startTime'}
}

# Marker color per dataset (fallback: orange). The color depends only on the
# selected file, so it is resolved once before the plotting loop rather than
# re-computed per row as before.
MARKER_COLORS = {
    'timeline_path_points.csv': 'blue',
    'visits.csv': 'red',
    'raw_signals.csv': 'green',
}

# Decorate geospatial file names so they stand out in the selectbox.
enhanced_file_names = []
for f in csv_files:
    if f.name in GEOSPATIAL_FILES:
        enhanced_file_names.append(f"🗺️ {f.name} (Geospatial)")
    else:
        enhanced_file_names.append(f.name)

selected_enhanced_name = st.selectbox("Select a CSV file to view:", enhanced_file_names)

# Strip the decoration to recover the actual file name.
selected_file_name = selected_enhanced_name.replace('🗺️ ', '').replace(' (Geospatial)', '')

# Find the full path for the selected file.
selected_file_path = None
for file_path in csv_files:
    if file_path.name == selected_file_name:
        selected_file_path = file_path
        break

if selected_file_path:
    st.write(f"**File:** {selected_file_path}")

    try:
        # Read the CSV file.
        df = pd.read_csv(selected_file_path)

        # Display basic info; geospatial datasets get a highlighted banner.
        is_geospatial = selected_file_name in GEOSPATIAL_FILES
        if is_geospatial:
            st.success(f"🗺️ **Geospatial Dataset Detected** - {df.shape[0]} rows × {df.shape[1]} columns")
        else:
            st.write(f"**Shape:** {df.shape[0]} rows × {df.shape[1]} columns")

        # Map visualization for datasets with known coordinate columns.
        # (is_geospatial already implies membership in GEOSPATIAL_FILES, so
        # the previous redundant second membership check was dropped.)
        if is_geospatial:
            geo_config = GEOSPATIAL_FILES[selected_file_name]
            lat_col = geo_config['lat']
            lon_col = geo_config['lon']
            time_col = geo_config['time']

            if lat_col and lon_col and lat_col in df.columns and lon_col in df.columns:
                st.subheader("🗺️ Map Visualization")

                # Drop rows with missing coordinates.
                geo_df = df.dropna(subset=[lat_col, lon_col])
                # Remember the valid-point count BEFORE sampling: the info
                # message below previously reported len(df) (all rows, even
                # those without coordinates) and the pre-sample count was
                # lost once geo_df was overwritten.
                total_valid = len(geo_df)

                if total_valid > 0:
                    # Sample data if too large, for rendering performance.
                    if total_valid > 1000:
                        geo_df = geo_df.sample(n=1000)
                        st.info(f"Showing 1000 randomly sampled points out of {total_valid} total points for performance")

                    # Create a map centered on the mean coordinates.
                    center_lat = geo_df[lat_col].mean()
                    center_lon = geo_df[lon_col].mean()
                    m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

                    # Color is constant for the whole dataset — hoisted out
                    # of the per-row loop.
                    color = MARKER_COLORS.get(selected_file_name, 'orange')

                    # Add one circle marker per (sampled) point.
                    for idx, row in geo_df.iterrows():
                        popup_text = f"Index: {idx}"
                        if time_col and time_col in df.columns:
                            popup_text += f"<br>Time: {row[time_col]}"

                        folium.CircleMarker(
                            location=[row[lat_col], row[lon_col]],
                            radius=3,
                            popup=popup_text,
                            color=color,
                            fillColor=color,
                            fillOpacity=0.7
                        ).add_to(m)

                    folium_static(m)

                    # Show coordinate statistics (count reflects the points
                    # actually plotted, i.e. after any sampling).
                    st.subheader("📍 Coordinate Statistics")
                    coord_stats = pd.DataFrame({
                        'Statistic': ['Count', 'Min Lat', 'Max Lat', 'Min Lon', 'Max Lon', 'Center Lat', 'Center Lon'],
                        'Value': [
                            len(geo_df),
                            f"{geo_df[lat_col].min():.6f}",
                            f"{geo_df[lat_col].max():.6f}",
                            f"{geo_df[lon_col].min():.6f}",
                            f"{geo_df[lon_col].max():.6f}",
                            f"{center_lat:.6f}",
                            f"{center_lon:.6f}"
                        ]
                    })
                    st.dataframe(coord_stats)
                else:
                    st.warning("No valid coordinates found in this dataset")
            else:
                if selected_file_name == 'semantic_segments.csv':
                    st.info("📅 This dataset contains temporal data that links to spatial information in other datasets")
                else:
                    st.warning(f"Expected coordinate columns ({lat_col}, {lon_col}) not found in this dataset")

        # Show the first rows of every file type.
        st.subheader("Data Preview")
        st.dataframe(df.head(100))

        # Show per-column schema and null counts.
        st.subheader("Column Information")
        col_info = pd.DataFrame({
            'Column': df.columns,
            'Data Type': df.dtypes,
            'Non-Null Count': df.count(),
            'Null Count': df.isnull().sum()
        })
        st.dataframe(col_info)

        # Show basic statistics for numeric columns, if any.
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            st.subheader("Numeric Column Statistics")
            st.dataframe(df[numeric_cols].describe())

    except Exception as e:
        st.error(f"Error reading the CSV file: {str(e)}")