image downloader +read me
This commit is contained in:
98
IMAGE_DOWNLOADER_README.md
Normal file
98
IMAGE_DOWNLOADER_README.md
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# Discord Image Downloader
|
||||||
|
|
||||||
|
This script processes Discord chat log CSV files to download and convert images to a base64 dataset.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Parses all CSV files in the `discord_chat_logs/` directory
|
||||||
|
- Extracts attachment URLs from the `attachment_urls` column
|
||||||
|
- Downloads images over HTTP using the Python `requests` library
|
||||||
|
- Converts images to base64 format for easy storage and processing
|
||||||
|
- Saves metadata including channel, sender, timestamp, and message context
|
||||||
|
- Handles Discord CDN URLs with query parameters
|
||||||
|
- Implements retry logic and rate limiting
|
||||||
|
- Deduplicates images based on URL hash
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
1. Install dependencies:
|
||||||
|
```bash
|
||||||
|
./setup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Or manually:
|
||||||
|
```bash
|
||||||
|
pip3 install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Run the image downloader:
|
||||||
|
```bash
|
||||||
|
cd scripts
|
||||||
|
python3 image_downloader.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
The script creates an `images_dataset/` directory containing:
|
||||||
|
|
||||||
|
- `images_dataset.json` - Complete dataset with images in base64 format
|
||||||
|
|
||||||
|
### Dataset Structure
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"created_at": "2025-08-11 12:34:56 UTC",
|
||||||
|
"summary": {
|
||||||
|
"total_images": 42,
|
||||||
|
"channels": ["memes", "general", "nsfw"],
|
||||||
|
"total_size_bytes": 1234567,
|
||||||
|
"file_extensions": [".png", ".jpg", ".gif"],
|
||||||
|
"authors": ["user1", "user2"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"images": [
|
||||||
|
{
|
||||||
|
"url": "https://cdn.discordapp.com/attachments/...",
|
||||||
|
"channel": "memes",
|
||||||
|
"author_name": "username",
|
||||||
|
"author_nickname": "User Nickname",
|
||||||
|
"author_id": "123456789",
|
||||||
|
"message_id": "987654321",
|
||||||
|
"timestamp_utc": "2020-03-11 18:25:49.086000+00:00",
|
||||||
|
"content": "Message text content",
|
||||||
|
"file_extension": ".png",
|
||||||
|
"file_size": 54321,
|
||||||
|
"url_hash": "abc123def456",
|
||||||
|
"base64_data": "iVBORw0KGgoAAAANSUhEUgAA..."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Supported Image Formats
|
||||||
|
|
||||||
|
- PNG (.png)
|
||||||
|
- JPEG (.jpg, .jpeg)
|
||||||
|
- GIF (.gif)
|
||||||
|
- WebP (.webp)
|
||||||
|
- BMP (.bmp)
|
||||||
|
- TIFF (.tiff)
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
You can modify the following variables in `image_downloader.py`:
|
||||||
|
|
||||||
|
- `MAX_RETRIES` - Number of download retry attempts (default: 3)
|
||||||
|
- `DELAY_BETWEEN_REQUESTS` - Delay between requests in seconds (default: 0.5)
|
||||||
|
- `SUPPORTED_EXTENSIONS` - Set of supported image file extensions
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
The script includes robust error handling:
|
||||||
|
|
||||||
|
- Skips non-image URLs
|
||||||
|
- Retries failed downloads with exponential backoff
|
||||||
|
- Validates content types from server responses
|
||||||
|
- Continues processing even if individual downloads fail
|
||||||
|
- Logs all activities and errors to console
|
||||||
228
scripts/image_downloader.py
Executable file
228
scripts/image_downloader.py
Executable file
@@ -0,0 +1,228 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Discord Image Downloader and Base64 Converter
|
||||||
|
|
||||||
|
This script parses all CSV files in the discord_chat_logs directory,
|
||||||
|
extracts attachment URLs, downloads the images, and saves them in base64
|
||||||
|
format with associated metadata (channel and sender information).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import urllib.parse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
import time
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
# Configuration
CSV_DIRECTORY = "../discord_chat_logs"
OUTPUT_DIRECTORY = "../images_dataset"
OUTPUT_JSON_FILE = "images_dataset.json"
MAX_RETRIES = 3
DELAY_BETWEEN_REQUESTS = 0.5  # seconds; base delay, doubled per retry attempt

# Supported image extensions
SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.gif', '.webp', '.bmp', '.tiff'}


class ImageDownloader:
    """Download Discord attachment images referenced in chat-log CSVs and
    collect them into a single JSON dataset with base64 payloads plus
    per-image metadata (channel, author, timestamp, message content)."""

    def __init__(self, csv_dir: str, output_dir: str):
        """
        Args:
            csv_dir: Directory containing the exported Discord chat-log CSVs.
            output_dir: Directory the JSON dataset is written to (created,
                including parents, if it does not exist).
        """
        self.csv_dir = Path(csv_dir)
        self.output_dir = Path(output_dir)
        # parents=True: a missing intermediate directory should not be fatal.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.session = requests.Session()
        # Some CDNs reject requests without a browser-like User-Agent.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.images_data: List[Dict] = []  # one metadata record per image
        self.processed_urls: set = set()   # dedupe guard across all CSV files

    def get_file_extension_from_url(self, url: str) -> Optional[str]:
        """Return the supported image extension for *url*, or None.

        Handles Discord CDN URLs where the extension may appear either as
        the path suffix (.../name.png?ex=...) or as a ?format=webp query
        parameter.
        """
        parsed = urllib.parse.urlparse(url)
        path = parsed.path.lower()

        # Use the actual path suffix rather than a substring scan, so a
        # stray ".png" in the middle of the path cannot cause a false match,
        # and ".jpeg" is never confused with ".jpg".
        suffix = os.path.splitext(path)[1]
        if suffix in SUPPORTED_EXTENSIONS:
            return suffix

        # Fall back to the ?format= query parameter used by some CDN links.
        query_params = urllib.parse.parse_qs(parsed.query)
        if 'format' in query_params:
            candidate = f".{query_params['format'][0].lower()}"
            if candidate in SUPPORTED_EXTENSIONS:
                return candidate

        return None

    def is_image_url(self, url: str) -> bool:
        """True if *url* is an http(s) URL with a supported image extension."""
        if not url or not url.startswith(('http://', 'https://')):
            return False

        return self.get_file_extension_from_url(url) is not None

    def download_image(self, url: str) -> Optional[bytes]:
        """Download *url* with retries; return the raw bytes or None.

        Retries use exponential backoff (0.5s, 1s, 2s, ... — as documented
        in the README). Responses whose Content-Type is not image/* are
        rejected without retrying.
        """
        for attempt in range(MAX_RETRIES):
            try:
                print(f"Downloading: {url} (attempt {attempt + 1})")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()

                # Verify content is actually an image
                content_type = response.headers.get('content-type', '').lower()
                if not content_type.startswith('image/'):
                    print(f"Warning: URL doesn't return image content: {url}")
                    return None

                return response.content

            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
                if attempt < MAX_RETRIES - 1:
                    # Exponential backoff between retries.
                    time.sleep(DELAY_BETWEEN_REQUESTS * (2 ** attempt))
                else:
                    print(f"Failed to download after {MAX_RETRIES} attempts: {url}")
                    return None

        return None

    def process_csv_file(self, csv_path: Path) -> None:
        """Extract attachment URLs from one CSV and download each new image.

        The channel name is taken from the CSV filename stem. Duplicate and
        non-image URLs are skipped; the whole file is wrapped in a broad
        try/except so one malformed CSV cannot abort the run.
        """
        channel_name = csv_path.stem
        print(f"\nProcessing channel: {channel_name}")

        try:
            with open(csv_path, 'r', encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile)

                for row in reader:
                    attachment_urls = row.get('attachment_urls', '').strip()

                    if not attachment_urls:
                        continue

                    # Split multiple URLs if they exist (comma-separated)
                    urls = [url.strip() for url in attachment_urls.split(',') if url.strip()]

                    for url in urls:
                        # Skip duplicates and links that are not images.
                        if url in self.processed_urls:
                            continue

                        if not self.is_image_url(url):
                            continue

                        self.processed_urls.add(url)

                        # Download the image
                        image_data = self.download_image(url)
                        if image_data is None:
                            continue

                        # Short URL hash doubles as a stable dedupe key.
                        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
                        file_extension = self.get_file_extension_from_url(url) or '.unknown'

                        # Convert to base64
                        base64_data = base64.b64encode(image_data).decode('utf-8')

                        # Metadata record: who posted it, where, when, plus payload.
                        image_metadata = {
                            'url': url,
                            'channel': channel_name,
                            'author_name': row.get('author_name', ''),
                            'author_nickname': row.get('author_nickname', ''),
                            'author_id': row.get('author_id', ''),
                            'message_id': row.get('message_id', ''),
                            'timestamp_utc': row.get('timestamp_utc', ''),
                            'content': row.get('content', ''),
                            'file_extension': file_extension,
                            'file_size': len(image_data),
                            'url_hash': url_hash,
                            'base64_data': base64_data
                        }

                        self.images_data.append(image_metadata)
                        print(f"✓ Downloaded and converted: {url} ({len(image_data)} bytes)")

                        # Small delay to be respectful
                        time.sleep(DELAY_BETWEEN_REQUESTS)

        except Exception as e:
            # Best-effort: log the failure and continue with the next CSV.
            print(f"Error processing {csv_path}: {e}")

    def save_dataset(self) -> None:
        """Write the collected images plus summary metadata to a JSON file."""
        output_file = self.output_dir / OUTPUT_JSON_FILE

        # Summary statistics; sorted() makes the output deterministic
        # (set iteration order would otherwise vary between runs).
        summary = {
            'total_images': len(self.images_data),
            'channels': sorted(set(img['channel'] for img in self.images_data)),
            'total_size_bytes': sum(img['file_size'] for img in self.images_data),
            'file_extensions': sorted(set(img['file_extension'] for img in self.images_data)),
            'authors': sorted(set(img['author_name'] for img in self.images_data if img['author_name']))
        }

        # Prepare final dataset
        dataset = {
            'metadata': {
                'created_at': time.strftime('%Y-%m-%d %H:%M:%S UTC', time.gmtime()),
                'summary': summary
            },
            'images': self.images_data
        }

        # ensure_ascii=False keeps non-ASCII message content readable.
        with open(output_file, 'w', encoding='utf-8') as jsonfile:
            json.dump(dataset, jsonfile, indent=2, ensure_ascii=False)

        print(f"\n✓ Dataset saved to: {output_file}")
        print(f"Total images: {summary['total_images']}")
        print(f"Total size: {summary['total_size_bytes']:,} bytes")
        print(f"Channels: {', '.join(summary['channels'])}")

    def run(self) -> None:
        """Process every CSV in csv_dir, then save the dataset if non-empty."""
        print("Discord Image Downloader and Base64 Converter")
        print("=" * 50)

        # Find all CSV files
        csv_files = list(self.csv_dir.glob("*.csv"))
        if not csv_files:
            print(f"No CSV files found in {self.csv_dir}")
            return

        print(f"Found {len(csv_files)} CSV files to process")

        # Process each CSV file
        for csv_file in csv_files:
            self.process_csv_file(csv_file)

        # Save the final dataset
        if self.images_data:
            self.save_dataset()
        else:
            print("\nNo images were found or downloaded.")
|
||||||
|
|
||||||
|
def main():
    """Resolve the CSV and output directories relative to this script's
    location, then run the downloader end to end."""
    here = Path(__file__).parent
    logs_path = here / CSV_DIRECTORY
    dataset_path = here / OUTPUT_DIRECTORY

    # Bail out early with a clear message if the input directory is absent.
    if not logs_path.exists():
        print(f"Error: CSV directory not found: {logs_path}")
        return

    ImageDownloader(str(logs_path), str(dataset_path)).run()
|
||||||
|
|
||||||
|
# Entry point: run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user