From ba528a3806687a3cee0c53bfa0ca22c0dccf81ee Mon Sep 17 00:00:00 2001 From: Azeem Fidahusein Date: Mon, 11 Aug 2025 01:21:35 +0100 Subject: [PATCH] image downloader +read me --- IMAGE_DOWNLOADER_README.md | 98 ++++++++++++++++ scripts/image_downloader.py | 228 ++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 IMAGE_DOWNLOADER_README.md create mode 100755 scripts/image_downloader.py diff --git a/IMAGE_DOWNLOADER_README.md b/IMAGE_DOWNLOADER_README.md new file mode 100644 index 0000000..a351877 --- /dev/null +++ b/IMAGE_DOWNLOADER_README.md @@ -0,0 +1,98 @@ +# Discord Image Downloader + +This script processes Discord chat log CSV files to download and convert images to a base64 dataset. + +## Features + +- Parses all CSV files in the `discord_chat_logs/` directory +- Extracts attachment URLs from the `attachment_urls` column +- Downloads images using wget-like functionality (via Python requests) +- Converts images to base64 format for easy storage and processing +- Saves metadata including channel, sender, timestamp, and message context +- Handles Discord CDN URLs with query parameters +- Implements retry logic and rate limiting +- Deduplicates images based on URL hash + +## Setup + +1. Install dependencies: + ```bash + ./setup.sh + ``` + + Or manually: + ```bash + pip3 install -r requirements.txt + ``` + +2. 
Run the image downloader: + ```bash + cd scripts + python3 image_downloader.py + ``` + +## Output + +The script creates an `images_dataset/` directory containing: + +- `images_dataset.json` - Complete dataset with images in base64 format + +### Dataset Structure + +```json +{ + "metadata": { + "created_at": "2025-08-11 12:34:56 UTC", + "summary": { + "total_images": 42, + "channels": ["memes", "general", "nsfw"], + "total_size_bytes": 1234567, + "file_extensions": [".png", ".jpg", ".gif"], + "authors": ["user1", "user2"] + } + }, + "images": [ + { + "url": "https://cdn.discordapp.com/attachments/...", + "channel": "memes", + "author_name": "username", + "author_nickname": "User Nickname", + "author_id": "123456789", + "message_id": "987654321", + "timestamp_utc": "2020-03-11 18:25:49.086000+00:00", + "content": "Message text content", + "file_extension": ".png", + "file_size": 54321, + "url_hash": "abc123def456", + "base64_data": "iVBORw0KGgoAAAANSUhEUgAA..." + } + ] +} +``` + +## Supported Image Formats + +- PNG (.png) +- JPEG (.jpg, .jpeg) +- GIF (.gif) +- WebP (.webp) +- BMP (.bmp) +- TIFF (.tiff) + +## Configuration + +You can modify the following variables in `image_downloader.py`: + +- `MAX_RETRIES` - Number of download retry attempts (default: 3) +- `DELAY_BETWEEN_REQUESTS` - Delay between requests in seconds (default: 0.5) +- `SUPPORTED_EXTENSIONS` - Set of supported image file extensions + +## Error Handling + +The script includes robust error handling: + +- Skips non-image URLs +- Retries failed downloads with exponential backoff +- Validates content types from server responses +- Continues processing even if individual downloads fail +- Logs all activities and errors to console diff --git a/scripts/image_downloader.py b/scripts/image_downloader.py new file mode 100755 index 0000000..b6d151a --- /dev/null +++ b/scripts/image_downloader.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +""" +Discord Image Downloader and Base64 Converter + +This script parses 
# all CSV files in the discord_chat_logs directory,
# extracts attachment URLs, downloads the images, and saves them in base64
# format with associated metadata (channel and sender information).
# """

import csv
import os
import base64
import json
import requests
import urllib.parse
from pathlib import Path, PurePosixPath
from typing import Dict, List, Optional
import time
import hashlib

# Configuration
CSV_DIRECTORY = "../discord_chat_logs"
OUTPUT_DIRECTORY = "../images_dataset"
OUTPUT_JSON_FILE = "images_dataset.json"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 0.5  # seconds

# Supported image extensions
SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff'}


class ImageDownloader:
    """Download image attachments listed in CSV chat logs into one JSON dataset.

    Each ``*.csv`` file in ``csv_dir`` is read with ``csv.DictReader``; the file
    stem is used as the channel name. URLs found in the ``attachment_urls``
    column are downloaded once each, base64-encoded, and stored together with
    the row's author/message columns. ``save_dataset`` writes everything to
    ``output_dir / OUTPUT_JSON_FILE``.
    """

    def __init__(self, csv_dir: str, output_dir: str):
        """Prepare the output directory and the shared HTTP session.

        Args:
            csv_dir: Directory that is searched for ``*.csv`` files.
            output_dir: Directory the JSON dataset is written to; it is created
                (including missing parents) if it does not exist.
        """
        self.csv_dir = Path(csv_dir)
        self.output_dir = Path(output_dir)
        # parents=True so a nested, not-yet-existing output path does not fail.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        # One dict per successfully downloaded image (see _build_image_record).
        self.images_data: List[Dict[str, object]] = []
        # Every URL that has been attempted, so duplicates are fetched once.
        self.processed_urls = set()

    def get_file_extension_from_url(self, url: str) -> Optional[str]:
        """Return the supported image extension a URL refers to, or None.

        The extension is taken from the last component of the URL path
        (case-insensitively); the query string is ignored for this step. If the
        path has no supported extension, the first ``format`` query parameter
        is consulted instead.

        Args:
            url: The URL to inspect.

        Returns:
            A lower-case extension from ``SUPPORTED_EXTENSIONS`` (for example
            ``'.png'``), or ``None`` when no supported extension is found.
        """
        parsed = urllib.parse.urlparse(url)

        # Compare the real suffix instead of searching for a substring: a
        # substring test accepts paths such as "/x.png.zip" and, because the
        # extensions live in a set, can return a different answer from run to
        # run when several extensions occur in the same path.
        suffix = PurePosixPath(parsed.path.lower()).suffix
        if suffix in SUPPORTED_EXTENSIONS:
            return suffix

        # Fall back to an explicit ?format=<ext> query parameter.
        format_values = urllib.parse.parse_qs(parsed.query).get('format')
        if format_values:
            candidate = f'.{format_values[0].lower()}'
            if candidate in SUPPORTED_EXTENSIONS:
                return candidate

        return None

    def is_image_url(self, url: str) -> bool:
        """Return True if ``url`` is an http(s) URL with a supported image extension."""
        if not url or not url.startswith(('http://', 'https://')):
            return False

        return self.get_file_extension_from_url(url) is not None

    def download_image(self, url: str) -> Optional[bytes]:
        """Download ``url`` and return the response body, retrying on failure.

        Up to ``MAX_RETRIES`` attempts are made. Between attempts the method
        sleeps ``DELAY_BETWEEN_REQUESTS * 2 ** attempt`` seconds. HTTP 4xx
        responses other than 408 and 429 are treated as permanent and are not
        retried.

        Args:
            url: The URL to fetch.

        Returns:
            The raw response bytes, or ``None`` if every attempt failed, the
            failure was permanent, or the response's Content-Type does not
            start with ``image/``.
        """
        for attempt in range(MAX_RETRIES):
            try:
                print(f"Downloading: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")

                # e.response is None for connection-level failures.
                status = getattr(e.response, 'status_code', None)
                is_permanent = (
                    status is not None
                    and 400 <= status < 500
                    and status not in (408, 429)
                )
                if is_permanent:
                    # Retrying e.g. a 403/404 cannot succeed; give up now.
                    print(f"Not retrying (HTTP {status}): {url}")
                    return None

                if attempt < MAX_RETRIES - 1:
                    time.sleep(DELAY_BETWEEN_REQUESTS * 2 ** attempt)
                continue

            # Verify content is actually an image
            content_type = response.headers.get('content-type', '').lower()
            if not content_type.startswith('image/'):
                print(f"Warning: URL doesn't return image content: {url}")
                return None

            return response.content

        print(f"Failed to download after {MAX_RETRIES} attempts: {url}")
        return None

    def _build_image_record(
        self,
        url: str,
        channel_name: str,
        row: Dict[str, Optional[str]],
        image_data: bytes,
    ) -> Dict[str, object]:
        """Assemble the dataset entry for one downloaded image."""
        record: Dict[str, object] = {'url': url, 'channel': channel_name}

        # DictReader stores None for cells missing from a short row; normalise
        # those to '' so every entry has string metadata.
        for column in ('author_name', 'author_nickname', 'author_id',
                       'message_id', 'timestamp_utc', 'content'):
            record[column] = row.get(column) or ''

        record['file_extension'] = self.get_file_extension_from_url(url) or '.unknown'
        record['file_size'] = len(image_data)
        # Short identifier derived from the URL; not used for security.
        record['url_hash'] = hashlib.md5(url.encode()).hexdigest()[:12]
        record['base64_data'] = base64.b64encode(image_data).decode('utf-8')
        return record

    def process_csv_file(self, csv_path: Path) -> None:
        """Download every new image URL referenced by one CSV file.

        The file stem is used as the channel name. For each row, the
        ``attachment_urls`` cell is split on commas; URLs that were already
        attempted or that do not look like images are skipped. Successful
        downloads are appended to ``self.images_data``.

        Any exception raised while reading the file is reported and swallowed
        so that the remaining CSV files are still processed.

        Args:
            csv_path: Path of the CSV file to process.
        """
        channel_name = csv_path.stem
        print(f"\nProcessing channel: {channel_name}")

        try:
            # newline='' is what the csv module expects; utf-8-sig drops a
            # leading BOM (if any) so the first column name is not corrupted.
            with open(csv_path, 'r', encoding='utf-8-sig', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    # The cell is None (not missing) when a row is short.
                    attachment_urls = (row.get('attachment_urls') or '').strip()
                    if not attachment_urls:
                        continue

                    # Split multiple URLs if they exist (comma-separated)
                    for url in (part.strip() for part in attachment_urls.split(',')):
                        if not url or url in self.processed_urls:
                            continue
                        if not self.is_image_url(url):
                            continue

                        self.processed_urls.add(url)
                        image_data = self.download_image(url)

                        # Pause after every attempt, successful or not, so a
                        # run of dead links does not hit the server back-to-back.
                        time.sleep(DELAY_BETWEEN_REQUESTS)

                        if image_data is None:
                            continue

                        self.images_data.append(
                            self._build_image_record(url, channel_name, row, image_data)
                        )
                        print(f"✓ Downloaded and converted: {url} ({len(image_data)} bytes)")

        except Exception as e:
            # Per-file boundary: report and move on to the next CSV file.
            print(f"Error processing {csv_path}: {e}")

    def save_dataset(self) -> None:
        """Write the collected images and a summary to the output JSON file.

        The summary lists are sorted so repeated runs over the same data
        produce the same file.
        """
        output_file = self.output_dir / OUTPUT_JSON_FILE

        # Create summary statistics
        summary = {
            'total_images': len(self.images_data),
            'channels': sorted({img['channel'] for img in self.images_data}),
            'total_size_bytes': sum(img['file_size'] for img in self.images_data),
            'file_extensions': sorted({img['file_extension'] for img in self.images_data}),
            'authors': sorted(
                {img['author_name'] for img in self.images_data if img['author_name']}
            ),
        }

        # Prepare final dataset
        dataset = {
            'metadata': {
                'created_at': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
                'summary': summary
            },
            'images': self.images_data
        }

        # Save to JSON file
        with open(output_file, 'w', encoding='utf-8') as jsonfile:
            json.dump(dataset, jsonfile, indent=2, ensure_ascii=False)

        print(f"\n✓ Dataset saved to: {output_file}")
        print(f"Total images: {summary['total_images']}")
        print(f"Total size: {summary['total_size_bytes']:,} bytes")
        print(f"Channels: {', '.join(summary['channels'])}")

    def run(self) -> None:
        """Process every CSV file in ``csv_dir`` and save the dataset if non-empty."""
        print("Discord Image Downloader and Base64 Converter")
        print("=" * 50)

        # Sorted so the processing order (and therefore which row "wins" for a
        # URL that appears in several files) does not depend on the filesystem.
        csv_files = sorted(self.csv_dir.glob("*.csv"))
        if not csv_files:
            print(f"No CSV files found in {self.csv_dir}")
            return

        print(f"Found {len(csv_files)} CSV files to process")

        # Process each CSV file
        for csv_file in csv_files:
            self.process_csv_file(csv_file)

        # Save the final dataset
        if self.images_data:
            self.save_dataset()
        else:
            print("\nNo images were found or downloaded.")


def main():
    """Resolve the configured directories relative to this script and run."""
    script_dir = Path(__file__).parent
    csv_directory = script_dir / CSV_DIRECTORY
    output_directory = script_dir / OUTPUT_DIRECTORY

    if not csv_directory.exists():
        print(f"Error: CSV directory not found: {csv_directory}")
        return

    downloader = ImageDownloader(str(csv_directory), str(output_directory))
    downloader.run()


if __name__ == "__main__":
    main()