# Timeline CSV Viewer — Streamlit app (167 lines, 6.5 KiB, Python)
"""Streamlit viewer for Google-Timeline-style CSV exports.

Scans ``../timeline_csv/*/`` for CSV files, lets the user pick one, and —
for datasets with known coordinate columns — renders a folium map plus
coordinate statistics. Every file also gets a tabular preview, column
schema, and numeric summary.
"""

import streamlit as st
import pandas as pd
import os
from pathlib import Path
import folium
from streamlit_folium import folium_static
import numpy as np

st.set_page_config(page_title="Timeline CSV Viewer", layout="wide")

st.title("Timeline CSV Viewer")

# Path to the timeline_csv folder, relative to the app's working directory.
timeline_csv_path = Path("../timeline_csv")

# Collect every CSV from the immediate subdirectories of timeline_csv.
csv_files = []
if timeline_csv_path.exists():
    for subdir in timeline_csv_path.iterdir():
        if subdir.is_dir():
            csv_files.extend(subdir.glob("*.csv"))

if not csv_files:
    st.error("No CSV files found in the timeline_csv folder.")
    st.stop()

# Known geospatial datasets and the columns holding their coordinates/time.
# A value of None means the dataset lacks that kind of column.
GEOSPATIAL_FILES = {
    'timeline_path_points.csv': {'lat': 'lat', 'lon': 'lon', 'time': 'time'},
    'visits.csv': {'lat': 'top_lat', 'lon': 'top_lon', 'time': 'startTime'},
    'raw_signals.csv': {'lat': 'lat', 'lon': 'lon', 'time': 'timestamp'},
    'frequent_places.csv': {'lat': 'lat', 'lon': 'lon', 'time': None},
    'semantic_segments.csv': {'lat': None, 'lon': None, 'time': 'startTime'}
}

# Marker color per dataset (fallback: orange). The color depends only on the
# selected file, so it is resolved once before the plotting loop rather than
# re-computed per row as before.
MARKER_COLORS = {
    'timeline_path_points.csv': 'blue',
    'visits.csv': 'red',
    'raw_signals.csv': 'green',
}

# Decorate geospatial file names so they stand out in the selectbox.
enhanced_file_names = []
for f in csv_files:
    if f.name in GEOSPATIAL_FILES:
        enhanced_file_names.append(f"🗺️ {f.name} (Geospatial)")
    else:
        enhanced_file_names.append(f.name)

selected_enhanced_name = st.selectbox("Select a CSV file to view:", enhanced_file_names)

# Strip the decoration to recover the actual file name.
selected_file_name = selected_enhanced_name.replace('🗺️ ', '').replace(' (Geospatial)', '')

# Find the full path for the selected file.
selected_file_path = None
for file_path in csv_files:
    if file_path.name == selected_file_name:
        selected_file_path = file_path
        break

if selected_file_path:
    st.write(f"**File:** {selected_file_path}")

    try:
        # Read the CSV file.
        df = pd.read_csv(selected_file_path)

        # Display basic info; geospatial datasets get a highlighted banner.
        is_geospatial = selected_file_name in GEOSPATIAL_FILES
        if is_geospatial:
            st.success(f"🗺️ **Geospatial Dataset Detected** - {df.shape[0]} rows × {df.shape[1]} columns")
        else:
            st.write(f"**Shape:** {df.shape[0]} rows × {df.shape[1]} columns")

        # Map visualization for datasets with known coordinate columns.
        # (is_geospatial already implies membership in GEOSPATIAL_FILES, so
        # the previous redundant second membership check was dropped.)
        if is_geospatial:
            geo_config = GEOSPATIAL_FILES[selected_file_name]
            lat_col = geo_config['lat']
            lon_col = geo_config['lon']
            time_col = geo_config['time']

            if lat_col and lon_col and lat_col in df.columns and lon_col in df.columns:
                st.subheader("🗺️ Map Visualization")

                # Drop rows with missing coordinates.
                geo_df = df.dropna(subset=[lat_col, lon_col])
                # Remember the valid-point count BEFORE sampling: the info
                # message below previously reported len(df) (all rows, even
                # those without coordinates) and the pre-sample count was
                # lost once geo_df was overwritten.
                total_valid = len(geo_df)

                if total_valid > 0:
                    # Sample data if too large, for rendering performance.
                    if total_valid > 1000:
                        geo_df = geo_df.sample(n=1000)
                        st.info(f"Showing 1000 randomly sampled points out of {total_valid} total points for performance")

                    # Create a map centered on the mean coordinates.
                    center_lat = geo_df[lat_col].mean()
                    center_lon = geo_df[lon_col].mean()
                    m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

                    # Color is constant for the whole dataset — hoisted out
                    # of the per-row loop.
                    color = MARKER_COLORS.get(selected_file_name, 'orange')

                    # Add one circle marker per (sampled) point.
                    for idx, row in geo_df.iterrows():
                        popup_text = f"Index: {idx}"
                        if time_col and time_col in df.columns:
                            popup_text += f"<br>Time: {row[time_col]}"

                        folium.CircleMarker(
                            location=[row[lat_col], row[lon_col]],
                            radius=3,
                            popup=popup_text,
                            color=color,
                            fillColor=color,
                            fillOpacity=0.7
                        ).add_to(m)

                    folium_static(m)

                    # Show coordinate statistics (count reflects the points
                    # actually plotted, i.e. after any sampling).
                    st.subheader("📍 Coordinate Statistics")
                    coord_stats = pd.DataFrame({
                        'Statistic': ['Count', 'Min Lat', 'Max Lat', 'Min Lon', 'Max Lon', 'Center Lat', 'Center Lon'],
                        'Value': [
                            len(geo_df),
                            f"{geo_df[lat_col].min():.6f}",
                            f"{geo_df[lat_col].max():.6f}",
                            f"{geo_df[lon_col].min():.6f}",
                            f"{geo_df[lon_col].max():.6f}",
                            f"{center_lat:.6f}",
                            f"{center_lon:.6f}"
                        ]
                    })
                    st.dataframe(coord_stats)
                else:
                    st.warning("No valid coordinates found in this dataset")
            else:
                if selected_file_name == 'semantic_segments.csv':
                    st.info("📅 This dataset contains temporal data that links to spatial information in other datasets")
                else:
                    st.warning(f"Expected coordinate columns ({lat_col}, {lon_col}) not found in this dataset")

        # Show the first rows of every file type.
        st.subheader("Data Preview")
        st.dataframe(df.head(100))

        # Show per-column schema and null counts.
        st.subheader("Column Information")
        col_info = pd.DataFrame({
            'Column': df.columns,
            'Data Type': df.dtypes,
            'Non-Null Count': df.count(),
            'Null Count': df.isnull().sum()
        })
        st.dataframe(col_info)

        # Show basic statistics for numeric columns, if any.
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            st.subheader("Numeric Column Statistics")
            st.dataframe(df[numeric_cols].describe())

    except Exception as e:
        st.error(f"Error reading the CSV file: {str(e)}")