Files
cult-scraper/apps/image_viewer/image_viewer.py
2025-08-11 01:35:14 +01:00

227 lines
7.9 KiB
Python

import streamlit as st
import json
import os
from pathlib import Path
import requests
from PIL import Image
from io import BytesIO
# Page-level configuration: browser tab title, tab icon, and wide layout
st.set_page_config(
    layout="wide",
    page_icon="🖼️",
    page_title="Image Dataset Viewer",
)
# Cache the parsed dataset so Streamlit reruns do not re-read the file
@st.cache_data
def load_dataset():
    """Load the images dataset JSON file.

    The file is looked up at ``../images_dataset/images_dataset.json``
    relative to the current working directory first. When it is not found
    there, the same relative path is resolved against this script's own
    directory, so the app also starts when launched from another directory.

    Returns:
        The parsed JSON content, or an empty dict when the file cannot be
        read or parsed (the error is reported on the page in that case).
    """
    relative_path = Path("../images_dataset/images_dataset.json")
    dataset_path = relative_path
    if not dataset_path.exists():
        # Fall back to a location anchored at this script rather than the CWD.
        dataset_path = Path(__file__).resolve().parent / relative_path
    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: invalid JSON
        # (json.JSONDecodeError) or invalid UTF-8 (UnicodeDecodeError).
        # NOTE(review): st.cache_data presumably caches this empty fallback
        # too, so a cache clear may be needed after fixing the file - confirm.
        st.error(f"Error loading dataset: {e}")
        return {}
@st.cache_data
def get_channels(data):
    """Return the sorted channel names found in the dataset.

    The channel list stored under ``data['metadata']['summary']['channels']``
    is used when it is present and non-empty. Otherwise the names are
    gathered from the ``'channel'`` key of every dict in ``data['images']``.
    A non-dict ``data`` yields an empty list.
    """
    is_mapping = isinstance(data, dict)

    # Preferred source: the channel list recorded in the metadata summary
    if is_mapping and 'metadata' in data and 'summary' in data['metadata']:
        declared = data['metadata']['summary'].get('channels', [])
        if declared:
            return sorted(declared)

    # Fallback: scan the image entries themselves
    entries = data.get('images', []) if is_mapping else []
    discovered = {
        entry['channel']
        for entry in entries
        if isinstance(entry, dict) and 'channel' in entry
    }
    return sorted(discovered)
def display_image(image_url, caption="", base64_data=None):
    """Render one image on the page and report whether it worked.

    Sources are tried in this order:

    1. ``base64_data``, unless it is empty or the placeholder string.
    2. ``image_url`` when it starts with ``http://`` or ``https://``
       (downloaded with a 10 second timeout).
    3. ``image_url`` as a path relative to the parent of this script's
       directory.

    Returns:
        True when the image was handed to ``st.image``; False when no usable
        source exists or loading failed (an error is shown on the page).
    """
    try:
        has_inline_payload = bool(base64_data) and base64_data != "image datta ..........."
        if has_inline_payload:
            # Decode the embedded payload
            import base64
            raw_bytes = base64.b64decode(base64_data)
            picture = Image.open(BytesIO(raw_bytes))
        elif not image_url:
            st.error("No valid image source found")
            return False
        elif image_url.startswith(('http://', 'https://')):
            # Remote image: download it first
            reply = requests.get(image_url, timeout=10)
            reply.raise_for_status()
            picture = Image.open(BytesIO(reply.content))
        else:
            # Local image, resolved against the parent of this script's folder
            local_file = Path(__file__).parent.parent / image_url
            if not local_file.exists():
                st.error(f"Image not found: {image_url}")
                return False
            picture = Image.open(local_file)

        st.image(picture, caption=caption, use_column_width=True)
        return True
    except Exception as exc:
        st.error(f"Error loading image: {exc}")
        return False
def _show_dataset_summary(data):
    """Render the four headline metrics when the dataset has a metadata summary."""
    if not (isinstance(data, dict) and 'metadata' in data):
        return
    metadata = data['metadata']
    if 'summary' not in metadata:
        return
    summary = metadata['summary']
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Images", summary.get('total_images', 'Unknown'))
    with col2:
        st.metric("Channels", len(summary.get('channels', [])))
    with col3:
        st.metric("Authors", len(summary.get('authors', [])))
    with col4:
        size_mb = summary.get('total_size_bytes', 0) / (1024 * 1024)
        st.metric("Total Size", f"{size_mb:.1f} MB")


def _collect_channel_images(data, selected_channel):
    """Return the displayable entries of one channel, tagged with their dataset position."""
    images = data.get('images', []) if isinstance(data, dict) else []
    return [
        {'id': position, 'data': item}
        for position, item in enumerate(images)
        if isinstance(item, dict)
        and item.get('channel') == selected_channel
        and ('url' in item or 'base64_data' in item)
    ]


def _choose_image_index(image_count):
    """Render the Previous / slider / Next controls and return the chosen index.

    The index lives in ``st.session_state.image_index`` so it survives reruns.
    With a single image no controls are drawn and the index is 0.
    """
    last_index = image_count - 1

    # The stored index may come from a previously selected, larger channel.
    # Clamp it, otherwise st.slider rejects the out-of-range default value.
    stored_index = st.session_state.get('image_index', 0)
    st.session_state.image_index = min(max(stored_index, 0), last_index)

    if image_count <= 1:
        return st.session_state.image_index

    col1, col2, col3 = st.columns([1, 2, 1])

    # Both buttons are processed before the slider is created so that the
    # slider is always drawn with the index that is actually displayed.
    # Filling col3 before col2 does not change the visual layout.
    with col1:
        if st.button("⬅️ Previous", use_container_width=True):
            if st.session_state.image_index > 0:
                st.session_state.image_index -= 1
            else:
                # Wrap around to the last image
                st.session_state.image_index = last_index
    with col3:
        if st.button("Next ➡️", use_container_width=True):
            if st.session_state.image_index < last_index:
                st.session_state.image_index += 1
            else:
                # Wrap around to the first image
                st.session_state.image_index = 0
    with col2:
        st.session_state.image_index = st.slider(
            "Image",
            0,
            last_index,
            st.session_state.image_index,
            help=f"Navigate through {image_count} images"
        )
    return st.session_state.image_index


def _show_image_entry(current_image, image_index, image_count, selected_channel):
    """Render one dataset entry: the image on the left, its metadata on the right."""
    image_data = current_image['data']
    image_url = image_data.get('url')
    base64_data = image_data.get('base64_data')

    if not (image_url or base64_data):
        st.error("No image URL or base64 data found in this entry")
        st.json(image_data)
        return

    col1, col2 = st.columns([2, 1])
    with col1:
        st.subheader(f"Image {image_index + 1} of {image_count}")
        caption = f"Channel: #{selected_channel}"
        if 'author_name' in image_data:
            caption += f" | Author: {image_data['author_name']}"
        if 'timestamp_utc' in image_data:
            caption += f" | Time: {image_data['timestamp_utc']}"
        display_image(image_url, caption, base64_data)
    with col2:
        st.subheader("Metadata")
        metadata_to_show = {
            'ID': current_image['id'],
            'Channel': image_data.get('channel', 'Unknown'),
            'Author': image_data.get('author_name', 'Unknown'),
            'Nickname': image_data.get('author_nickname', 'Unknown'),
            'Author ID': image_data.get('author_id', 'Unknown'),
            'Message ID': image_data.get('message_id', 'Unknown'),
            'Timestamp': image_data.get('timestamp_utc', 'Unknown'),
            'File Extension': image_data.get('file_extension', 'Unknown'),
            'File Size': f"{image_data.get('file_size', 0):,} bytes" if image_data.get('file_size') else 'Unknown',
            'Message': image_data.get('content', 'No message'),
        }
        for key, value in metadata_to_show.items():
            # 'ID' is the dataset position; 0 is a valid value and must not be
            # hidden by the truthiness check used for the optional fields.
            if key == 'ID' or (value and value != 'Unknown'):
                st.write(f"**{key}:** {value}")
        # Show the complete raw entry on demand
        st.subheader("Raw Data")
        with st.expander("Show all metadata"):
            st.json(image_data)


def main():
    """Run the viewer page.

    Loads the dataset, shows its summary, lets the user pick a channel and
    step through that channel's images, and renders the selected image next
    to its metadata. Stops early with a message when the dataset is empty,
    has no channels, or the chosen channel has no displayable images.
    """
    st.title("🖼️ Image Dataset Viewer")
    st.markdown("Browse images from your dataset by channel")

    # Load dataset
    with st.spinner("Loading dataset..."):
        data = load_dataset()
    if not data:
        st.error("No data loaded. Please check your dataset file.")
        return

    _show_dataset_summary(data)

    # Channel selection
    channels = get_channels(data)
    if not channels:
        st.error("No channels found in the dataset.")
        return
    selected_channel = st.selectbox(
        "Select a channel:",
        channels,
        help="Choose a channel to view its images"
    )

    # Only entries that carry a URL or base64 payload can be displayed
    channel_images = _collect_channel_images(data, selected_channel)
    if not channel_images:
        st.warning(f"No images found for channel: {selected_channel}")
        return
    st.success(f"Found {len(channel_images)} images in #{selected_channel}")

    # Navigation and display
    image_index = _choose_image_index(len(channel_images))
    _show_image_entry(
        channel_images[image_index],
        image_index,
        len(channel_images),
        selected_channel,
    )
# Script entry point: build the page when this file is executed directly
if __name__ == "__main__":
    main()