Compare commits

..

2 Commits

Author SHA1 Message Date
245cc81289 images dataset 2025-08-11 01:22:03 +01:00
ba528a3806 image downloader +read me 2025-08-11 01:21:35 +01:00
3 changed files with 4379 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
# Discord Image Downloader
This script processes Discord chat log CSV files to download and convert images to a base64 dataset.
## Features
- Parses all CSV files in the `discord_chat_logs/` directory
- Extracts attachment URLs from the `attachment_urls` column
- Downloads images over HTTP using the Python `requests` library
- Converts images to base64 format for easy storage and processing
- Saves metadata including channel, sender, timestamp, and message context
- Handles Discord CDN URLs with query parameters
- Implements retry logic and rate limiting
- Deduplicates images based on URL hash
## Setup
1. Install dependencies:
```bash
./setup.sh
```
Or manually:
```bash
pip3 install -r requirements.txt
```
2. Run the image downloader:
```bash
cd scripts
python3 image_downloader.py
```
## Output
The script creates an `images_dataset/` directory containing:
- `images_dataset.json` - Complete dataset with images in base64 format
### Dataset Structure
```json
{
"metadata": {
"created_at": "2025-08-11 12:34:56 UTC",
"summary": {
"total_images": 42,
"channels": ["memes", "general", "nsfw"],
"total_size_bytes": 1234567,
"file_extensions": [".png", ".jpg", ".gif"],
"authors": ["user1", "user2"]
}
},
"images": [
{
"url": "https://cdn.discordapp.com/attachments/...",
"channel": "memes",
"author_name": "username",
"author_nickname": "User Nickname",
"author_id": "123456789",
"message_id": "987654321",
"timestamp_utc": "2020-03-11 18:25:49.086000+00:00",
"content": "Message text content",
"file_extension": ".png",
"file_size": 54321,
"url_hash": "abc123def456",
"base64_data": "iVBORw0KGgoAAAANSUhEUgAA..."
}
]
}
```
## Supported Image Formats
- PNG (.png)
- JPEG (.jpg, .jpeg)
- GIF (.gif)
- WebP (.webp)
- BMP (.bmp)
- TIFF (.tiff)
## Configuration
You can modify the following variables in `image_downloader.py`:
- `MAX_RETRIES` - Number of download retry attempts (default: 3)
- `DELAY_BETWEEN_REQUESTS` - Delay between requests in seconds (default: 0.5)
- `SUPPORTED_EXTENSIONS` - Set of supported image file extensions
## Error Handling
The script includes robust error handling:
- Skips non-image URLs
- Retries failed downloads with exponential backoff
- Validates content types from server responses
- Continues processing even if individual downloads fail
- Logs all activities and errors to console

File diff suppressed because one or more lines are too long

228
scripts/image_downloader.py Executable file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
Discord Image Downloader and Base64 Converter
This script parses all CSV files in the discord_chat_logs directory,
extracts attachment URLs, downloads the images, and saves them in base64
format with associated metadata (channel and sender information).
"""
import csv
import os
import base64
import json
import requests
import urllib.parse
from pathlib import Path
from typing import Dict, List, Optional
import time
import hashlib
# Configuration
CSV_DIRECTORY = "../discord_chat_logs"
OUTPUT_DIRECTORY = "../images_dataset"
OUTPUT_JSON_FILE = "images_dataset.json"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 0.5  # seconds between successive downloads

# Supported image extensions
SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff'}


class ImageDownloader:
    """Collect image attachments referenced by Discord chat-log CSV files.

    Walks every ``*.csv`` file in *csv_dir*, downloads each image URL found
    in the ``attachment_urls`` column, and accumulates base64-encoded image
    payloads plus per-message metadata, finally written as one JSON dataset
    under *output_dir*.
    """

    def __init__(self, csv_dir: str, output_dir: str):
        """Prepare directories, an HTTP session, and accumulators.

        Args:
            csv_dir: Directory containing the chat-log CSV files.
            output_dir: Directory the JSON dataset is written to (created
                if missing).
        """
        self.csv_dir = Path(csv_dir)
        self.output_dir = Path(output_dir)
        # parents=True so a missing intermediate directory does not abort the run.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.images_data: List[Dict] = []  # one metadata dict per downloaded image
        self.processed_urls = set()        # dedup: each URL is attempted at most once

    def get_file_extension_from_url(self, url: str) -> Optional[str]:
        """Return the supported image extension for *url*, or ``None``.

        Handles Discord CDN URLs whose query string carries size/expiry
        parameters, and falls back to an explicit ``format=`` query
        parameter when the path itself has no recognizable suffix.
        """
        parsed = urllib.parse.urlparse(url)
        path = parsed.path.lower()
        # Use the path's actual suffix rather than a substring scan, so that
        # e.g. "/a.png.exe" is not misidentified as a PNG.
        suffix = os.path.splitext(path)[1]
        if suffix in SUPPORTED_EXTENSIONS:
            return suffix
        # Some CDNs advertise the type via ?format=<ext> instead of the path.
        query_params = urllib.parse.parse_qs(parsed.query)
        if 'format' in query_params:
            format_val = query_params['format'][0].lower()
            if f'.{format_val}' in SUPPORTED_EXTENSIONS:
                return f'.{format_val}'
        return None

    def is_image_url(self, url: str) -> bool:
        """True when *url* is an http(s) URL with a supported image extension."""
        if not url or not url.startswith(('http://', 'https://')):
            return False
        return self.get_file_extension_from_url(url) is not None

    def download_image(self, url: str) -> Optional[bytes]:
        """Fetch *url* with retries and linear backoff.

        Returns:
            The raw image bytes, or ``None`` when the server responds with a
            non-image content type or all ``MAX_RETRIES`` attempts fail.
        """
        for attempt in range(MAX_RETRIES):
            try:
                print(f"Downloading: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                # Trust the server's content type over the URL: links with
                # image-like paths can still serve HTML error pages.
                content_type = response.headers.get('content-type', '').lower()
                if not content_type.startswith('image/'):
                    print(f"Warning: URL doesn't return image content: {url}")
                    return None
                return response.content
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Linear backoff: wait longer after each failed attempt.
                    time.sleep(DELAY_BETWEEN_REQUESTS * (attempt + 1))
                else:
                    print(f"Failed to download after {MAX_RETRIES} attempts: {url}")
                    return None
        return None

    def process_csv_file(self, csv_path: Path) -> None:
        """Extract, download, and record every image attachment in one CSV.

        The CSV file name (without suffix) is used as the channel name.
        Rows without attachments, already-seen URLs, non-image URLs, and
        failed downloads are all skipped; processing continues regardless.
        """
        channel_name = csv_path.stem
        print(f"\nProcessing channel: {channel_name}")
        try:
            with open(csv_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    # "or ''" guards against short rows, where DictReader
                    # fills missing columns with None rather than ''.
                    attachment_urls = (row.get('attachment_urls') or '').strip()
                    if not attachment_urls:
                        continue
                    # A message may carry several attachments, comma-separated.
                    urls = [url.strip() for url in attachment_urls.split(',') if url.strip()]
                    for url in urls:
                        if url in self.processed_urls:
                            continue
                        if not self.is_image_url(url):
                            continue
                        self.processed_urls.add(url)
                        image_data = self.download_image(url)
                        if image_data is None:
                            continue
                        # Short, stable identifier used for dedup/debugging.
                        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
                        file_extension = self.get_file_extension_from_url(url) or '.unknown'
                        base64_data = base64.b64encode(image_data).decode('utf-8')
                        image_metadata = {
                            'url': url,
                            'channel': channel_name,
                            'author_name': row.get('author_name', ''),
                            'author_nickname': row.get('author_nickname', ''),
                            'author_id': row.get('author_id', ''),
                            'message_id': row.get('message_id', ''),
                            'timestamp_utc': row.get('timestamp_utc', ''),
                            'content': row.get('content', ''),
                            'file_extension': file_extension,
                            'file_size': len(image_data),
                            'url_hash': url_hash,
                            'base64_data': base64_data
                        }
                        self.images_data.append(image_metadata)
                        print(f"✓ Downloaded and converted: {url} ({len(image_data)} bytes)")
                        # Rate limiting: be polite to the CDN.
                        time.sleep(DELAY_BETWEEN_REQUESTS)
        except Exception as e:
            # Best-effort: a broken file must not abort the other channels.
            print(f"Error processing {csv_path}: {e}")

    def save_dataset(self) -> None:
        """Write the collected images plus summary statistics to one JSON file."""
        output_file = self.output_dir / OUTPUT_JSON_FILE
        # Summary statistics derived from the collected records.
        summary = {
            'total_images': len(self.images_data),
            'channels': list(set(img['channel'] for img in self.images_data)),
            'total_size_bytes': sum(img['file_size'] for img in self.images_data),
            'file_extensions': list(set(img['file_extension'] for img in self.images_data)),
            'authors': list(set(img['author_name'] for img in self.images_data if img['author_name']))
        }
        dataset = {
            'metadata': {
                'created_at': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
                'summary': summary
            },
            'images': self.images_data
        }
        # ensure_ascii=False keeps non-ASCII usernames/content readable.
        with open(output_file, 'w', encoding='utf-8') as jsonfile:
            json.dump(dataset, jsonfile, indent=2, ensure_ascii=False)
        print(f"\n✓ Dataset saved to: {output_file}")
        print(f"Total images: {summary['total_images']}")
        print(f"Total size: {summary['total_size_bytes']:,} bytes")
        print(f"Channels: {', '.join(summary['channels'])}")

    def run(self) -> None:
        """Process every CSV file in the input directory, then save the dataset."""
        print("Discord Image Downloader and Base64 Converter")
        print("=" * 50)
        csv_files = list(self.csv_dir.glob("*.csv"))
        if not csv_files:
            print(f"No CSV files found in {self.csv_dir}")
            return
        print(f"Found {len(csv_files)} CSV files to process")
        for csv_file in csv_files:
            self.process_csv_file(csv_file)
        # Only write a dataset when at least one image was collected.
        if self.images_data:
            self.save_dataset()
        else:
            print("\nNo images were found or downloaded.")
def main():
    """Entry point: locate the chat-log directory and run the downloader.

    Paths are resolved relative to this script's location so the tool works
    regardless of the current working directory.
    """
    here = Path(__file__).parent
    logs_dir = here / CSV_DIRECTORY
    dataset_dir = here / OUTPUT_DIRECTORY
    if not logs_dir.exists():
        print(f"Error: CSV directory not found: {logs_dir}")
        return
    ImageDownloader(str(logs_dir), str(dataset_dir)).run()


if __name__ == "__main__":
    main()