Compare commits
2 Commits
e22705600a
...
245cc81289
| Author | SHA1 | Date | |
|---|---|---|---|
| 245cc81289 | |||
| ba528a3806 |
98
IMAGE_DOWNLOADER_README.md
Normal file
98
IMAGE_DOWNLOADER_README.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# Discord Image Downloader
|
||||
|
||||
This script processes Discord chat log CSV files, downloading the attached images and converting them into a base64-encoded JSON dataset.
|
||||
|
||||
## Features
|
||||
|
||||
- Parses all CSV files in the `discord_chat_logs/` directory
|
||||
- Extracts attachment URLs from the `attachment_urls` column
|
||||
- Downloads images over HTTP using the Python `requests` library
|
||||
- Converts images to base64 format for easy storage and processing
|
||||
- Saves metadata including channel, sender, timestamp, and message context
|
||||
- Handles Discord CDN URLs with query parameters
|
||||
- Implements retry logic and rate limiting
|
||||
- Deduplicates images based on URL hash
|
||||
|
||||
## Setup
|
||||
|
||||
1. Install dependencies:
|
||||
```bash
|
||||
./setup.sh
|
||||
```
|
||||
|
||||
Or manually:
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
```
|
||||
|
||||
2. Run the image downloader:
|
||||
```bash
|
||||
cd scripts
|
||||
python3 image_downloader.py
|
||||
```
|
||||
|
||||
## Output
|
||||
|
||||
The script creates an `images_dataset/` directory containing:
|
||||
|
||||
- `images_dataset.json` - Complete dataset with images in base64 format
|
||||
|
||||
### Dataset Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"metadata": {
|
||||
"created_at": "2025-08-11 12:34:56 UTC",
|
||||
"summary": {
|
||||
"total_images": 42,
|
||||
"channels": ["memes", "general", "nsfw"],
|
||||
"total_size_bytes": 1234567,
|
||||
"file_extensions": [".png", ".jpg", ".gif"],
|
||||
"authors": ["user1", "user2"]
|
||||
}
|
||||
},
|
||||
"images": [
|
||||
{
|
||||
"url": "https://cdn.discordapp.com/attachments/...",
|
||||
"channel": "memes",
|
||||
"author_name": "username",
|
||||
"author_nickname": "User Nickname",
|
||||
"author_id": "123456789",
|
||||
"message_id": "987654321",
|
||||
"timestamp_utc": "2020-03-11 18:25:49.086000+00:00",
|
||||
"content": "Message text content",
|
||||
"file_extension": ".png",
|
||||
"file_size": 54321,
|
||||
"url_hash": "abc123def456",
|
||||
"base64_data": "iVBORw0KGgoAAAANSUhEUgAA..."
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Supported Image Formats
|
||||
|
||||
- PNG (.png)
|
||||
- JPEG (.jpg, .jpeg)
|
||||
- GIF (.gif)
|
||||
- WebP (.webp)
|
||||
- BMP (.bmp)
|
||||
- TIFF (.tiff)
|
||||
|
||||
## Configuration
|
||||
|
||||
You can modify the following variables in `image_downloader.py`:
|
||||
|
||||
- `MAX_RETRIES` - Number of download retry attempts (default: 3)
|
||||
- `DELAY_BETWEEN_REQUESTS` - Delay between requests in seconds (default: 0.5)
|
||||
- `SUPPORTED_EXTENSIONS` - Set of supported image file extensions
|
||||
|
||||
## Error Handling
|
||||
|
||||
The script includes robust error handling:
|
||||
|
||||
- Skips non-image URLs
|
||||
- Retries failed downloads with exponential backoff
|
||||
- Validates content types from server responses
|
||||
- Continues processing even if individual downloads fail
|
||||
- Logs all activities and errors to console
|
||||
4053
images_dataset/images_dataset.json
Normal file
4053
images_dataset/images_dataset.json
Normal file
File diff suppressed because one or more lines are too long
228
scripts/image_downloader.py
Executable file
228
scripts/image_downloader.py
Executable file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Discord Image Downloader and Base64 Converter
|
||||
|
||||
This script parses all CSV files in the discord_chat_logs directory,
|
||||
extracts attachment URLs, downloads the images, and saves them in base64
|
||||
format with associated metadata (channel and sender information).
|
||||
"""
|
||||
|
||||
import csv
|
||||
import os
|
||||
import base64
|
||||
import json
|
||||
import requests
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
import time
|
||||
import hashlib
|
||||
|
||||
# Configuration
CSV_DIRECTORY = "../discord_chat_logs"    # input CSVs, relative to scripts/
OUTPUT_DIRECTORY = "../images_dataset"    # output directory, relative to scripts/
OUTPUT_JSON_FILE = "images_dataset.json"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 0.5  # seconds

# Supported image extensions
SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff'}


class ImageDownloader:
    """Download Discord attachment images referenced by chat-log CSVs and
    collect them as base64 records with per-message metadata.

    Workflow: `run()` globs `*.csv` in `csv_dir`, extracts URLs from each
    row's `attachment_urls` column, downloads each unique image URL, and
    writes a single JSON dataset into `output_dir`.
    """

    def __init__(self, csv_dir: str, output_dir: str):
        """Prepare directories, a shared HTTP session, and accumulators.

        Args:
            csv_dir: Directory containing the Discord chat-log CSV files.
            output_dir: Directory where the JSON dataset is written
                (created if missing).
        """
        self.csv_dir = Path(csv_dir)
        self.output_dir = Path(output_dir)
        # parents=True so a missing intermediate directory doesn't abort the run.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        # A browser-like UA; some CDNs reject default python-requests agents.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.images_data: List[Dict] = []   # one metadata dict per downloaded image
        self.processed_urls = set()         # dedupe: URLs already attempted

    def get_file_extension_from_url(self, url: str) -> Optional[str]:
        """Extract a supported image extension from a URL.

        Handles Discord CDN URLs, checking in order:
        1. an exact extension suffix on the URL path (the normal case),
        2. an extension embedded anywhere in the path (legacy fallback),
        3. a ``format=`` query parameter.

        Args:
            url: The attachment URL to inspect.

        Returns:
            The extension including the leading dot (e.g. ``'.png'``),
            or None if no supported extension is found.
        """
        parsed = urllib.parse.urlparse(url)
        path = parsed.path.lower()

        # Prefer a proper suffix match; iterate in sorted order so the
        # result is deterministic (set iteration order is not).
        for ext in sorted(SUPPORTED_EXTENSIONS):
            if path.endswith(ext):
                return ext

        # Fallback: extension appearing mid-path (kept for backward
        # compatibility with oddly-formed URLs).
        for ext in sorted(SUPPORTED_EXTENSIONS):
            if ext in path:
                return ext

        # Check query parameters for format info (e.g. ...?format=webp).
        query_params = urllib.parse.parse_qs(parsed.query)
        if 'format' in query_params:
            format_val = query_params['format'][0].lower()
            if f'.{format_val}' in SUPPORTED_EXTENSIONS:
                return f'.{format_val}'

        return None

    def is_image_url(self, url: str) -> bool:
        """Return True if `url` is an http(s) URL with a supported image extension."""
        if not url or not url.startswith(('http://', 'https://')):
            return False

        return self.get_file_extension_from_url(url) is not None

    def download_image(self, url: str) -> Optional[bytes]:
        """Download an image with retries and linear backoff.

        Args:
            url: The image URL to fetch.

        Returns:
            The raw image bytes, or None if the download failed after
            MAX_RETRIES attempts or the server did not return image content.
        """
        for attempt in range(MAX_RETRIES):
            try:
                print(f"Downloading: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                # Verify the server actually returned an image; no retry here,
                # because a wrong content type is not a transient failure.
                content_type = response.headers.get('content-type', '').lower()
                if not content_type.startswith('image/'):
                    print(f"Warning: URL doesn't return image content: {url}")
                    return None

                return response.content

            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear backoff: wait longer after each failed attempt.
                    time.sleep(DELAY_BETWEEN_REQUESTS * (attempt + 1))
                else:
                    print(f"Failed to download after {MAX_RETRIES} attempts: {url}")
                    return None

        return None

    def process_csv_file(self, csv_path: Path) -> None:
        """Process one chat-log CSV: download its images and record metadata.

        The channel name is taken from the CSV filename stem. Each row's
        `attachment_urls` column may hold multiple comma-separated URLs.
        Failures on individual rows/URLs are logged and skipped so the rest
        of the file still gets processed.
        """
        channel_name = csv_path.stem
        print(f"\nProcessing channel: {channel_name}")

        try:
            with open(csv_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)

                for row_num, row in enumerate(reader, 1):
                    # `or ''` guards against None (short rows under DictReader).
                    attachment_urls = (row.get('attachment_urls') or '').strip()

                    if not attachment_urls:
                        continue

                    # Split multiple URLs if they exist (comma-separated).
                    urls = [url.strip() for url in attachment_urls.split(',') if url.strip()]

                    for url in urls:
                        if url in self.processed_urls:
                            continue  # already attempted this exact URL

                        if not self.is_image_url(url):
                            continue

                        self.processed_urls.add(url)

                        # Download the image
                        image_data = self.download_image(url)
                        if image_data is None:
                            continue

                        # Short, stable identifier derived from the URL
                        # (md5 used for dedup/naming, not security).
                        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
                        file_extension = self.get_file_extension_from_url(url) or '.unknown'

                        # Convert to base64
                        base64_data = base64.b64encode(image_data).decode('utf-8')

                        # Create metadata record for the dataset.
                        image_metadata = {
                            'url': url,
                            'channel': channel_name,
                            'author_name': row.get('author_name', ''),
                            'author_nickname': row.get('author_nickname', ''),
                            'author_id': row.get('author_id', ''),
                            'message_id': row.get('message_id', ''),
                            'timestamp_utc': row.get('timestamp_utc', ''),
                            'content': row.get('content', ''),
                            'file_extension': file_extension,
                            'file_size': len(image_data),
                            'url_hash': url_hash,
                            'base64_data': base64_data
                        }

                        self.images_data.append(image_metadata)
                        print(f"✓ Downloaded and converted: {url} ({len(image_data)} bytes)")

                        # Small delay to be respectful of the CDN.
                        time.sleep(DELAY_BETWEEN_REQUESTS)

        except Exception as e:
            # Broad by design: one bad CSV must not abort the whole run.
            print(f"Error processing {csv_path}: {e}")

    def save_dataset(self) -> None:
        """Write the collected images and summary statistics to the JSON file.

        Summary lists are sorted so repeated runs over the same data
        produce byte-identical output.
        """
        output_file = self.output_dir / OUTPUT_JSON_FILE

        # Create summary statistics (sorted for deterministic output).
        summary = {
            'total_images': len(self.images_data),
            'channels': sorted(set(img['channel'] for img in self.images_data)),
            'total_size_bytes': sum(img['file_size'] for img in self.images_data),
            'file_extensions': sorted(set(img['file_extension'] for img in self.images_data)),
            'authors': sorted(set(img['author_name'] for img in self.images_data if img['author_name']))
        }

        # Prepare final dataset
        dataset = {
            'metadata': {
                'created_at': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
                'summary': summary
            },
            'images': self.images_data
        }

        # Save to JSON file (ensure_ascii=False keeps non-ASCII content readable).
        with open(output_file, 'w', encoding='utf-8') as jsonfile:
            json.dump(dataset, jsonfile, indent=2, ensure_ascii=False)

        print(f"\n✓ Dataset saved to: {output_file}")
        print(f"Total images: {summary['total_images']}")
        print(f"Total size: {summary['total_size_bytes']:,} bytes")
        print(f"Channels: {', '.join(summary['channels'])}")

    def run(self) -> None:
        """Main execution: process every CSV in `csv_dir`, then save the dataset."""
        print("Discord Image Downloader and Base64 Converter")
        print("=" * 50)

        # Find all CSV files
        csv_files = list(self.csv_dir.glob("*.csv"))
        if not csv_files:
            print(f"No CSV files found in {self.csv_dir}")
            return

        print(f"Found {len(csv_files)} CSV files to process")

        # Process each CSV file
        for csv_file in csv_files:
            self.process_csv_file(csv_file)

        # Save the final dataset only if anything was downloaded.
        if self.images_data:
            self.save_dataset()
        else:
            print("\nNo images were found or downloaded.")
|
||||
|
||||
def main():
    """Resolve input/output directories relative to this script and run the downloader."""
    here = Path(__file__).parent
    csv_directory = here / CSV_DIRECTORY
    output_directory = here / OUTPUT_DIRECTORY

    if csv_directory.exists():
        ImageDownloader(str(csv_directory), str(output_directory)).run()
    else:
        print(f"Error: CSV directory not found: {csv_directory}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user