# streamlit_app/app.py — CSV Streamlit viewer with geospatial integrations.
import os
from pathlib import Path

import folium
import numpy as np
import pandas as pd
import streamlit as st
from streamlit_folium import folium_static

# ---------------------------------------------------------------------------
# Page setup and CSV discovery.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="Timeline CSV Viewer", layout="wide")
st.title("Timeline CSV Viewer")

# Folder holding the timeline exports.
# NOTE(review): this path is relative to the process working directory, not
# to this file — assumes the app is launched from streamlit_app/; confirm.
timeline_csv_path = Path("../timeline_csv")

# Collect every *.csv exactly one level deep: timeline_csv/<subdir>/*.csv.
csv_files = []
if timeline_csv_path.exists():
    csv_files = [
        csv_file
        for subdir in timeline_csv_path.iterdir()
        if subdir.is_dir()
        for csv_file in subdir.glob("*.csv")
    ]

if not csv_files:
    st.error("No CSV files found in the timeline_csv folder.")
    st.stop()
# Geospatial metadata per known CSV: the names of its latitude, longitude,
# and timestamp columns. A None entry marks a column kind the file lacks.
GEOSPATIAL_FILES = {
    'timeline_path_points.csv': {'lat': 'lat', 'lon': 'lon', 'time': 'time'},
    'visits.csv': {'lat': 'top_lat', 'lon': 'top_lon', 'time': 'startTime'},
    'raw_signals.csv': {'lat': 'lat', 'lon': 'lon', 'time': 'timestamp'},
    'frequent_places.csv': {'lat': 'lat', 'lon': 'lon', 'time': None},
    'semantic_segments.csv': {'lat': None, 'lon': None, 'time': 'startTime'}
}

# Selectbox labels: geospatial files are decorated with a map emoji + suffix.
enhanced_file_names = [
    f"🗺️ {f.name} (Geospatial)" if f.name in GEOSPATIAL_FILES else f.name
    for f in csv_files
]

selected_enhanced_name = st.selectbox("Select a CSV file to view:", enhanced_file_names)

# Undo the label decoration to recover the plain filename.
selected_file_name = selected_enhanced_name.replace('🗺️ ', '').replace(' (Geospatial)', '')

# Resolve the plain filename back to its full path (first match wins;
# None when nothing matches, handled by the display block below).
selected_file_path = next(
    (path for path in csv_files if path.name == selected_file_name),
    None,
)
if selected_file_path:
    st.write(f"**File:** {selected_file_path}")

    try:
        # Read the CSV file
        df = pd.read_csv(selected_file_path)

        # Basic shape info; geospatial datasets get a highlighted banner.
        is_geospatial = selected_file_name in GEOSPATIAL_FILES
        if is_geospatial:
            st.success(f"🗺️ **Geospatial Dataset Detected** - {df.shape[0]} rows × {df.shape[1]} columns")
        else:
            st.write(f"**Shape:** {df.shape[0]} rows × {df.shape[1]} columns")

        # Map visualization for datasets with known coordinate columns.
        # (A former second test `selected_file_name in GEOSPATIAL_FILES`
        # was redundant: `is_geospatial` is exactly that membership check.)
        if is_geospatial:
            geo_config = GEOSPATIAL_FILES[selected_file_name]
            lat_col = geo_config['lat']
            lon_col = geo_config['lon']
            time_col = geo_config['time']

            if lat_col and lon_col and lat_col in df.columns and lon_col in df.columns:
                st.subheader("🗺️ Map Visualization")

                # Filter out rows with null coordinates.
                geo_df = df.dropna(subset=[lat_col, lon_col])

                if len(geo_df) > 0:
                    # Bug fix: the info message previously reported len(df)
                    # as the total, which also counted the null-coordinate
                    # rows dropped above. Capture the valid count first.
                    total_valid = len(geo_df)
                    if total_valid > 1000:
                        # Cap markers at 1000 for rendering performance.
                        geo_df = geo_df.sample(n=1000)
                        st.info(f"Showing 1000 randomly sampled points out of {total_valid} total points for performance")

                    # Create map centered on mean coordinates.
                    center_lat = geo_df[lat_col].mean()
                    center_lon = geo_df[lon_col].mean()
                    m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

                    # Loop invariants hoisted out of the marker loop: the
                    # marker colour is fixed per dataset, and the presence
                    # of the time column never varies per row.
                    color = {
                        'timeline_path_points.csv': 'blue',
                        'visits.csv': 'red',
                        'raw_signals.csv': 'green',
                    }.get(selected_file_name, 'orange')
                    has_time = bool(time_col) and time_col in geo_df.columns

                    for idx, row in geo_df.iterrows():
                        popup_text = f"Index: {idx}"
                        if has_time:
                            popup_text += f"<br>Time: {row[time_col]}"

                        folium.CircleMarker(
                            location=[row[lat_col], row[lon_col]],
                            radius=3,
                            popup=popup_text,
                            color=color,
                            # snake_case keywords per the documented folium
                            # API (the former camelCase fillColor/fillOpacity
                            # relied on undocumented kwarg pass-through).
                            fill_color=color,
                            fill_opacity=0.7
                        ).add_to(m)

                    folium_static(m)

                    # Summary of the coordinate extent and center.
                    st.subheader("📍 Coordinate Statistics")
                    coord_stats = pd.DataFrame({
                        'Statistic': ['Count', 'Min Lat', 'Max Lat', 'Min Lon', 'Max Lon', 'Center Lat', 'Center Lon'],
                        'Value': [
                            len(geo_df),
                            f"{geo_df[lat_col].min():.6f}",
                            f"{geo_df[lat_col].max():.6f}",
                            f"{geo_df[lon_col].min():.6f}",
                            f"{geo_df[lon_col].max():.6f}",
                            f"{center_lat:.6f}",
                            f"{center_lon:.6f}"
                        ]
                    })
                    st.dataframe(coord_stats)
                else:
                    st.warning("No valid coordinates found in this dataset")
            else:
                if selected_file_name == 'semantic_segments.csv':
                    st.info("📅 This dataset contains temporal data that links to spatial information in other datasets")
                else:
                    st.warning(f"Expected coordinate columns ({lat_col}, {lon_col}) not found in this dataset")

        # Show first few rows.
        st.subheader("Data Preview")
        st.dataframe(df.head(100))

        # Per-column dtype and null counts.
        st.subheader("Column Information")
        col_info = pd.DataFrame({
            'Column': df.columns,
            'Data Type': df.dtypes,
            'Non-Null Count': df.count(),
            'Null Count': df.isnull().sum()
        })
        st.dataframe(col_info)

        # Summary statistics, numeric columns only.
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            st.subheader("Numeric Column Statistics")
            st.dataframe(df[numeric_cols].describe())

    except Exception as e:
        # Broad catch is intentional: any parse/IO failure is surfaced in
        # the UI instead of crashing the whole app.
        st.error(f"Error reading the CSV file: {str(e)}")