Compare commits

..

2 Commits

Author SHA1 Message Date
245cc81289 images dataset 2025-08-11 01:22:03 +01:00
ba528a3806 image downloader +read me 2025-08-11 01:21:35 +01:00
3 changed files with 4379 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
# Discord Image Downloader
This script processes Discord chat log CSV files to download and convert images to a base64 dataset.
## Features
- Parses all CSV files in the `discord_chat_logs/` directory
- Extracts attachment URLs from the `attachment_urls` column
- Downloads images over HTTP using the Python `requests` library
- Converts images to base64 format for easy storage and processing
- Saves metadata including channel, sender, timestamp, and message context
- Handles Discord CDN URLs with query parameters
- Implements retry logic and rate limiting
- Deduplicates images based on URL hash
## Setup
1. Install dependencies:
```bash
./setup.sh
```
Or manually:
```bash
pip3 install -r requirements.txt
```
2. Run the image downloader:
```bash
cd scripts
python3 image_downloader.py
```
## Output
The script creates an `images_dataset/` directory containing:
- `images_dataset.json` - Complete dataset with images in base64 format
### Dataset Structure
```json
{
"metadata": {
"created_at": "2025-08-11 12:34:56 UTC",
"summary": {
"total_images": 42,
"channels": ["memes", "general", "nsfw"],
"total_size_bytes": 1234567,
"file_extensions": [".png", ".jpg", ".gif"],
"authors": ["user1", "user2"]
}
},
"images": [
{
"url": "https://cdn.discordapp.com/attachments/...",
"channel": "memes",
"author_name": "username",
"author_nickname": "User Nickname",
"author_id": "123456789",
"message_id": "987654321",
"timestamp_utc": "2020-03-11 18:25:49.086000+00:00",
"content": "Message text content",
"file_extension": ".png",
"file_size": 54321,
"url_hash": "abc123def456",
"base64_data": "iVBORw0KGgoAAAANSUhEUgAA..."
}
]
}
```
## Supported Image Formats
- PNG (.png)
- JPEG (.jpg, .jpeg)
- GIF (.gif)
- WebP (.webp)
- BMP (.bmp)
- TIFF (.tiff)
## Configuration
You can modify the following variables in `image_downloader.py`:
- `MAX_RETRIES` - Number of download retry attempts (default: 3)
- `DELAY_BETWEEN_REQUESTS` - Delay between requests in seconds (default: 0.5)
- `SUPPORTED_EXTENSIONS` - Set of supported image file extensions
## Error Handling
The script includes robust error handling:
- Skips non-image URLs
- Retries failed downloads with exponential backoff
- Validates content types from server responses
- Continues processing even if individual downloads fail
- Logs all activities and errors to console

File diff suppressed because one or more lines are too long

228
scripts/image_downloader.py Executable file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Discord Image Downloader and Base64 Converter
This script parses all CSV files in the discord_chat_logs directory,
extracts attachment URLs, downloads the images, and saves them in base64
format with associated metadata (channel and sender information).
"""
import csv
import os
import base64
import json
import requests
import urllib.parse
from pathlib import Path
from typing import Dict, List, Optional
import time
import hashlib
# Configuration
CSV_DIRECTORY = "../discord_chat_logs"
OUTPUT_DIRECTORY = "../images_dataset"
OUTPUT_JSON_FILE = "images_dataset.json"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 0.5  # seconds between successive downloads

# Supported image extensions
SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff'}


class ImageDownloader:
    """Collect image attachments referenced by Discord chat-log CSV files.

    Walks every ``*.csv`` file in *csv_dir*, downloads each image URL found
    in the ``attachment_urls`` column, and accumulates base64-encoded image
    payloads plus per-message metadata, finally written as one JSON dataset
    under *output_dir*.
    """

    def __init__(self, csv_dir: str, output_dir: str):
        """Prepare directories, an HTTP session, and accumulators.

        Args:
            csv_dir: Directory containing the chat-log CSV files.
            output_dir: Directory the JSON dataset is written to (created
                if missing).
        """
        self.csv_dir = Path(csv_dir)
        self.output_dir = Path(output_dir)
        # parents=True so a missing intermediate directory does not abort the run.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.images_data: List[Dict] = []  # one metadata dict per downloaded image
        self.processed_urls = set()        # dedup: each URL is attempted at most once

    def get_file_extension_from_url(self, url: str) -> Optional[str]:
        """Return the supported image extension for *url*, or ``None``.

        Handles Discord CDN URLs whose query string carries size/expiry
        parameters, and falls back to an explicit ``format=`` query
        parameter when the path itself has no recognizable suffix.
        """
        parsed = urllib.parse.urlparse(url)
        path = parsed.path.lower()
        # Use the path's actual suffix rather than a substring scan, so that
        # e.g. "/a.png.exe" is not misidentified as a PNG.
        suffix = os.path.splitext(path)[1]
        if suffix in SUPPORTED_EXTENSIONS:
            return suffix
        # Some CDNs advertise the type via ?format=<ext> instead of the path.
        query_params = urllib.parse.parse_qs(parsed.query)
        if 'format' in query_params:
            format_val = query_params['format'][0].lower()
            if f'.{format_val}' in SUPPORTED_EXTENSIONS:
                return f'.{format_val}'
        return None

    def is_image_url(self, url: str) -> bool:
        """True when *url* is an http(s) URL with a supported image extension."""
        if not url or not url.startswith(('http://', 'https://')):
            return False
        return self.get_file_extension_from_url(url) is not None

    def download_image(self, url: str) -> Optional[bytes]:
        """Fetch *url* with retries and linear backoff.

        Returns:
            The raw image bytes, or ``None`` when the server responds with a
            non-image content type or all ``MAX_RETRIES`` attempts fail.
        """
        for attempt in range(MAX_RETRIES):
            try:
                print(f"Downloading: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                # Trust the server's content type over the URL: links with
                # image-like paths can still serve HTML error pages.
                content_type = response.headers.get('content-type', '').lower()
                if not content_type.startswith('image/'):
                    print(f"Warning: URL doesn't return image content: {url}")
                    return None
                return response.content
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear backoff: wait longer after each failed attempt.
                    time.sleep(DELAY_BETWEEN_REQUESTS * (attempt + 1))
                else:
                    print(f"Failed to download after {MAX_RETRIES} attempts: {url}")
                    return None
        return None

    def process_csv_file(self, csv_path: Path) -> None:
        """Extract, download, and record every image attachment in one CSV.

        The CSV file name (without suffix) is used as the channel name.
        Rows without attachments, already-seen URLs, non-image URLs, and
        failed downloads are all skipped; processing continues regardless.
        """
        channel_name = csv_path.stem
        print(f"\nProcessing channel: {channel_name}")
        try:
            with open(csv_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    # "or ''" guards against short rows, where DictReader
                    # fills missing columns with None rather than ''.
                    attachment_urls = (row.get('attachment_urls') or '').strip()
                    if not attachment_urls:
                        continue
                    # A message may carry several attachments, comma-separated.
                    urls = [url.strip() for url in attachment_urls.split(',') if url.strip()]
                    for url in urls:
                        if url in self.processed_urls:
                            continue
                        if not self.is_image_url(url):
                            continue
                        self.processed_urls.add(url)
                        image_data = self.download_image(url)
                        if image_data is None:
                            continue
                        # Short, stable identifier used for dedup/debugging.
                        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
                        file_extension = self.get_file_extension_from_url(url) or '.unknown'
                        base64_data = base64.b64encode(image_data).decode('utf-8')
                        image_metadata = {
                            'url': url,
                            'channel': channel_name,
                            'author_name': row.get('author_name', ''),
                            'author_nickname': row.get('author_nickname', ''),
                            'author_id': row.get('author_id', ''),
                            'message_id': row.get('message_id', ''),
                            'timestamp_utc': row.get('timestamp_utc', ''),
                            'content': row.get('content', ''),
                            'file_extension': file_extension,
                            'file_size': len(image_data),
                            'url_hash': url_hash,
                            'base64_data': base64_data
                        }
                        self.images_data.append(image_metadata)
                        print(f"✓ Downloaded and converted: {url} ({len(image_data)} bytes)")
                        # Rate limiting: be polite to the CDN.
                        time.sleep(DELAY_BETWEEN_REQUESTS)
        except Exception as e:
            # Best-effort: a broken file must not abort the other channels.
            print(f"Error processing {csv_path}: {e}")

    def save_dataset(self) -> None:
        """Write the collected images plus summary statistics to one JSON file."""
        output_file = self.output_dir / OUTPUT_JSON_FILE
        # Summary statistics derived from the collected records.
        summary = {
            'total_images': len(self.images_data),
            'channels': list(set(img['channel'] for img in self.images_data)),
            'total_size_bytes': sum(img['file_size'] for img in self.images_data),
            'file_extensions': list(set(img['file_extension'] for img in self.images_data)),
            'authors': list(set(img['author_name'] for img in self.images_data if img['author_name']))
        }
        dataset = {
            'metadata': {
                'created_at': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
                'summary': summary
            },
            'images': self.images_data
        }
        # ensure_ascii=False keeps non-ASCII usernames/content readable.
        with open(output_file, 'w', encoding='utf-8') as jsonfile:
            json.dump(dataset, jsonfile, indent=2, ensure_ascii=False)
        print(f"\n✓ Dataset saved to: {output_file}")
        print(f"Total images: {summary['total_images']}")
        print(f"Total size: {summary['total_size_bytes']:,} bytes")
        print(f"Channels: {', '.join(summary['channels'])}")

    def run(self) -> None:
        """Process every CSV file in the input directory, then save the dataset."""
        print("Discord Image Downloader and Base64 Converter")
        print("=" * 50)
        csv_files = list(self.csv_dir.glob("*.csv"))
        if not csv_files:
            print(f"No CSV files found in {self.csv_dir}")
            return
        print(f"Found {len(csv_files)} CSV files to process")
        for csv_file in csv_files:
            self.process_csv_file(csv_file)
        # Only write a dataset when at least one image was collected.
        if self.images_data:
            self.save_dataset()
        else:
            print("\nNo images were found or downloaded.")
def main():
    """Entry point: locate the chat-log directory and run the downloader.

    Paths are resolved relative to this script's location so the tool works
    regardless of the current working directory.
    """
    here = Path(__file__).parent
    logs_dir = here / CSV_DIRECTORY
    dataset_dir = here / OUTPUT_DIRECTORY
    if not logs_dir.exists():
        print(f"Error: CSV directory not found: {logs_dir}")
        return
    ImageDownloader(str(logs_dir), str(dataset_dir)).run()


if __name__ == "__main__":
    main()