image downloader +read me
This commit is contained in:
98
IMAGE_DOWNLOADER_README.md
Normal file
98
IMAGE_DOWNLOADER_README.md
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# Discord Image Downloader
|
||||||
|
|
||||||
|
This script processes Discord chat log CSV files to download and convert images to a base64 dataset.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Parses all CSV files in the `discord_chat_logs/` directory
|
||||||
|
- Extracts attachment URLs from the `attachment_urls` column
|
||||||
|
- Downloads images over HTTP using the Python `requests` library
|
||||||
|
- Converts images to base64 format for easy storage and processing
|
||||||
|
- Saves metadata including channel, sender, timestamp, and message context
|
||||||
|
- Handles Discord CDN URLs with query parameters
|
||||||
|
- Implements retry logic and rate limiting
|
||||||
|
- Deduplicates images based on URL hash
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
1. Install dependencies:
|
||||||
|
```bash
|
||||||
|
./setup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Or manually:
|
||||||
|
```bash
|
||||||
|
pip3 install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Run the image downloader:
|
||||||
|
```bash
|
||||||
|
cd scripts
|
||||||
|
python3 image_downloader.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
The script creates an `images_dataset/` directory containing:
|
||||||
|
|
||||||
|
- `images_dataset.json` - Complete dataset with images in base64 format
|
||||||
|
|
||||||
|
### Dataset Structure
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"created_at": "2025-08-11 12:34:56 UTC",
|
||||||
|
"summary": {
|
||||||
|
"total_images": 42,
|
||||||
|
"channels": ["memes", "general", "nsfw"],
|
||||||
|
"total_size_bytes": 1234567,
|
||||||
|
"file_extensions": [".png", ".jpg", ".gif"],
|
||||||
|
"authors": ["user1", "user2"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"images": [
|
||||||
|
{
|
||||||
|
"url": "https://cdn.discordapp.com/attachments/...",
|
||||||
|
"channel": "memes",
|
||||||
|
"author_name": "username",
|
||||||
|
"author_nickname": "User Nickname",
|
||||||
|
"author_id": "123456789",
|
||||||
|
"message_id": "987654321",
|
||||||
|
"timestamp_utc": "2020-03-11 18:25:49.086000+00:00",
|
||||||
|
"content": "Message text content",
|
||||||
|
"file_extension": ".png",
|
||||||
|
"file_size": 54321,
|
||||||
|
"url_hash": "abc123def456",
|
||||||
|
"base64_data": "iVBORw0KGgoAAAANSUhEUgAA..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Supported Image Formats
|
||||||
|
|
||||||
|
- PNG (.png)
|
||||||
|
- JPEG (.jpg, .jpeg)
|
||||||
|
- GIF (.gif)
|
||||||
|
- WebP (.webp)
|
||||||
|
- BMP (.bmp)
|
||||||
|
- TIFF (.tiff)
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
You can modify the following variables in `image_downloader.py`:
|
||||||
|
|
||||||
|
- `MAX_RETRIES` - Number of download retry attempts (default: 3)
|
||||||
|
- `DELAY_BETWEEN_REQUESTS` - Delay between requests in seconds (default: 0.5)
|
||||||
|
- `SUPPORTED_EXTENSIONS` - Set of supported image file extensions
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The script includes robust error handling:
|
||||||
|
|
||||||
|
- Skips non-image URLs
|
||||||
|
- Retries failed downloads with exponential backoff
|
||||||
|
- Validates content types from server responses
|
||||||
|
- Continues processing even if individual downloads fail
|
||||||
|
- Logs all activities and errors to console
|
||||||
228
scripts/image_downloader.py
Executable file
228
scripts/image_downloader.py
Executable file
@@ -0,0 +1,228 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Discord Image Downloader and Base64 Converter
|
||||||
|
|
||||||
|
This script parses all CSV files in the discord_chat_logs directory,
|
||||||
|
extracts attachment URLs, downloads the images, and saves them in base64
|
||||||
|
format with associated metadata (channel and sender information).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import urllib.parse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
import time
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
# Configuration
CSV_DIRECTORY = "../discord_chat_logs"
OUTPUT_DIRECTORY = "../images_dataset"
OUTPUT_JSON_FILE = "images_dataset.json"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 0.5  # seconds; base delay, doubled per retry attempt

# Supported image extensions
SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff'}


class ImageDownloader:
    """Download Discord attachment images referenced in chat-log CSVs and
    collect them into a single JSON dataset with base64 payloads plus
    per-image metadata (channel, author, timestamp, message content)."""

    def __init__(self, csv_dir: str, output_dir: str):
        """
        Args:
            csv_dir: Directory containing the exported Discord chat-log CSVs.
            output_dir: Directory the JSON dataset is written to (created,
                including parents, if it does not exist).
        """
        self.csv_dir = Path(csv_dir)
        self.output_dir = Path(output_dir)
        # parents=True: a missing intermediate directory should not be fatal.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        # Some CDNs reject requests without a browser-like User-Agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.images_data: List[Dict] = []  # one metadata record per image
        self.processed_urls: set = set()   # dedupe guard across all CSV files

    def get_file_extension_from_url(self, url: str) -> Optional[str]:
        """Return the supported image extension for *url*, or None.

        Handles Discord CDN URLs where the extension may appear either as
        the path suffix (.../name.png?ex=...) or as a ?format=webp query
        parameter.
        """
        parsed = urllib.parse.urlparse(url)
        path = parsed.path.lower()

        # Use the actual path suffix rather than a substring scan, so a
        # stray ".png" in the middle of the path cannot cause a false match,
        # and ".jpeg" is never confused with ".jpg".
        suffix = os.path.splitext(path)[1]
        if suffix in SUPPORTED_EXTENSIONS:
            return suffix

        # Fall back to the ?format= query parameter used by some CDN links.
        query_params = urllib.parse.parse_qs(parsed.query)
        if 'format' in query_params:
            candidate = f".{query_params['format'][0].lower()}"
            if candidate in SUPPORTED_EXTENSIONS:
                return candidate

        return None

    def is_image_url(self, url: str) -> bool:
        """True if *url* is an http(s) URL with a supported image extension."""
        if not url or not url.startswith(('http://', 'https://')):
            return False

        return self.get_file_extension_from_url(url) is not None

    def download_image(self, url: str) -> Optional[bytes]:
        """Download *url* with retries; return the raw bytes or None.

        Retries use exponential backoff (0.5s, 1s, 2s, ... — as documented
        in the README). Responses whose Content-Type is not image/* are
        rejected without retrying.
        """
        for attempt in range(MAX_RETRIES):
            try:
                print(f"Downloading: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                # Verify content is actually an image
                content_type = response.headers.get('content-type', '').lower()
                if not content_type.startswith('image/'):
                    print(f"Warning: URL doesn't return image content: {url}")
                    return None

                return response.content

            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Exponential backoff between retries.
                    time.sleep(DELAY_BETWEEN_REQUESTS * (2 ** attempt))
                else:
                    print(f"Failed to download after {MAX_RETRIES} attempts: {url}")
                    return None

        return None

    def process_csv_file(self, csv_path: Path) -> None:
        """Extract attachment URLs from one CSV and download each new image.

        The channel name is taken from the CSV filename stem. Duplicate and
        non-image URLs are skipped; the whole file is wrapped in a broad
        try/except so one malformed CSV cannot abort the run.
        """
        channel_name = csv_path.stem
        print(f"\nProcessing channel: {channel_name}")

        try:
            with open(csv_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)

                for row in reader:
                    attachment_urls = row.get('attachment_urls', '').strip()

                    if not attachment_urls:
                        continue

                    # Split multiple URLs if they exist (comma-separated)
                    urls = [url.strip() for url in attachment_urls.split(',') if url.strip()]

                    for url in urls:
                        # Skip duplicates and links that are not images.
                        if url in self.processed_urls:
                            continue

                        if not self.is_image_url(url):
                            continue

                        self.processed_urls.add(url)

                        # Download the image
                        image_data = self.download_image(url)
                        if image_data is None:
                            continue

                        # Short URL hash doubles as a stable dedupe key.
                        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
                        file_extension = self.get_file_extension_from_url(url) or '.unknown'

                        # Convert to base64
                        base64_data = base64.b64encode(image_data).decode('utf-8')

                        # Metadata record: who posted it, where, when, plus payload.
                        image_metadata = {
                            'url': url,
                            'channel': channel_name,
                            'author_name': row.get('author_name', ''),
                            'author_nickname': row.get('author_nickname', ''),
                            'author_id': row.get('author_id', ''),
                            'message_id': row.get('message_id', ''),
                            'timestamp_utc': row.get('timestamp_utc', ''),
                            'content': row.get('content', ''),
                            'file_extension': file_extension,
                            'file_size': len(image_data),
                            'url_hash': url_hash,
                            'base64_data': base64_data
                        }

                        self.images_data.append(image_metadata)
                        print(f"✓ Downloaded and converted: {url} ({len(image_data)} bytes)")

                        # Small delay to be respectful
                        time.sleep(DELAY_BETWEEN_REQUESTS)

        except Exception as e:
            # Best-effort: log the failure and continue with the next CSV.
            print(f"Error processing {csv_path}: {e}")

    def save_dataset(self) -> None:
        """Write the collected images plus summary metadata to a JSON file."""
        output_file = self.output_dir / OUTPUT_JSON_FILE

        # Summary statistics; sorted() makes the output deterministic
        # (set iteration order would otherwise vary between runs).
        summary = {
            'total_images': len(self.images_data),
            'channels': sorted(set(img['channel'] for img in self.images_data)),
            'total_size_bytes': sum(img['file_size'] for img in self.images_data),
            'file_extensions': sorted(set(img['file_extension'] for img in self.images_data)),
            'authors': sorted(set(img['author_name'] for img in self.images_data if img['author_name']))
        }

        # Prepare final dataset
        dataset = {
            'metadata': {
                'created_at': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
                'summary': summary
            },
            'images': self.images_data
        }

        # ensure_ascii=False keeps non-ASCII message content readable.
        with open(output_file, 'w', encoding='utf-8') as jsonfile:
            json.dump(dataset, jsonfile, indent=2, ensure_ascii=False)

        print(f"\n✓ Dataset saved to: {output_file}")
        print(f"Total images: {summary['total_images']}")
        print(f"Total size: {summary['total_size_bytes']:,} bytes")
        print(f"Channels: {', '.join(summary['channels'])}")

    def run(self) -> None:
        """Process every CSV in csv_dir, then save the dataset if non-empty."""
        print("Discord Image Downloader and Base64 Converter")
        print("=" * 50)

        # Find all CSV files
        csv_files = list(self.csv_dir.glob("*.csv"))
        if not csv_files:
            print(f"No CSV files found in {self.csv_dir}")
            return

        print(f"Found {len(csv_files)} CSV files to process")

        # Process each CSV file
        for csv_file in csv_files:
            self.process_csv_file(csv_file)

        # Save the final dataset
        if self.images_data:
            self.save_dataset()
        else:
            print("\nNo images were found or downloaded.")
|
||||||
|
|
||||||
|
def main():
    """Resolve the CSV and output directories relative to this script's
    location, then run the downloader end to end."""
    here = Path(__file__).parent
    logs_path = here / CSV_DIRECTORY
    dataset_path = here / OUTPUT_DIRECTORY

    # Bail out early with a clear message if the input directory is absent.
    if not logs_path.exists():
        print(f"Error: CSV directory not found: {logs_path}")
        return

    ImageDownloader(str(logs_path), str(dataset_path)).run()
|
||||||
|
|
||||||
|
# Entry point: run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user