2025-09-09 12:43:38 -04:00
parent 698ec83c96
commit a73ec73921
14 changed files with 2646 additions and 0 deletions

@@ -0,0 +1,377 @@
"""
CCR ETL MTGJSON Processing Script
This script handles the extraction, transformation, and loading of MTGJSON data
into a PostgreSQL database. It supports downloading, unzipping, preprocessing,
and batch inserting of various data formats.
"""
import json
import os
import yaml
from typing import Union
from zipfile import ZipFile
import psycopg2
import requests
import wmill
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
# Configuration paths
DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'
EXTRACT_CONFIG_PATH = 'f/CCR_ETL/ccr_extract_config'
DOWNLOAD_CONFIG_PATH = './shared/'
# Default processing parameters
DEFAULT_BATCH_SIZE = 1000
def validate_response_headers(response: requests.Response, outer_file_type: str) -> None:
"""Validate that the response content type matches the expected file type."""
    # Compare against the media-type prefix so parameters like "; charset=utf-8" or a missing header do not cause a KeyError or false failure.
    content_type = response.headers.get('Content-Type', '')
    if not content_type.startswith(f'application/{outer_file_type}'):
        raise ValueError(f"Expected application/{outer_file_type} response, got '{content_type}'")
def download_request(url: str, filename: str, outer_file_type: str) -> bytes:
"""Download a file from the given URL and validate its content type."""
print(f"🔽 Downloading {filename} from {url}")
response = requests.get(url)
response.raise_for_status()
validate_response_headers(response, outer_file_type)
print(f"✅ Download successful ({response.headers.get('Content-Length', 'Unknown')} bytes)")
return response.content
def generate_download_queue(url: str, filename: str, outer_file_type: str, iterables: dict) -> list:
"""
Generate a queue of download items based on URL templates and iterable values.
Example:
url = "https://tcgcsv.com/tcgplayer/{game_id}/groups"
iterables = {'game_id': [1,3,65,71,86]}
"""
queue = []
for key, value in iterables.items():
for item in value:
queue_item = {
                'url': url.format(**{key: item}),  # substitute the iterable key into the template, e.g. {game_id} -> 1
                'filename': filename.format(**{key: item}),
'outer_file_type': outer_file_type,
}
queue.append(queue_item)
return queue
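# Illustrative expansion (hypothetical filename template, not taken from the config):
# with url = "https://tcgcsv.com/tcgplayer/{game_id}/groups",
# filename = "groups_{game_id}.json", and iterables = {'game_id': [1, 3]},
# the queue would contain:
#   {'url': 'https://tcgcsv.com/tcgplayer/1/groups', 'filename': 'groups_1.json', 'outer_file_type': 'json'}
#   {'url': 'https://tcgcsv.com/tcgplayer/3/groups', 'filename': 'groups_3.json', 'outer_file_type': 'json'}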
def save_file(content: bytes, filename: str) -> None:
"""Save binary content to a file in the download directory."""
filepath = DOWNLOAD_CONFIG_PATH + filename
with open(filepath, 'wb') as f:
f.write(content)
print(f"💾 Saved {len(content)} bytes to {filename}")
def unzip_file(filename: str) -> str:
"""Extract a zip file and return the name of the extracted content."""
new_filename = filename.replace('.zip', '')
zip_path = DOWNLOAD_CONFIG_PATH + filename
with ZipFile(zip_path, 'r') as zip_ref:
file_list = zip_ref.namelist()
print(f"📦 Extracting {len(file_list)} files from {filename}")
zip_ref.extractall(DOWNLOAD_CONFIG_PATH)
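    # Note: the returned name assumes the archive's payload is named like the zip
    # minus its ".zip" suffix; the names in file_list are not checked against this.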
return new_filename
def load_file(filename: str, file_type: str) -> Union[dict, list]:
"""Load and parse a file from the download directory."""
filepath = DOWNLOAD_CONFIG_PATH + filename
if file_type == 'json':
with open(filepath, 'r') as f:
data = json.load(f)
print(f"📖 Loaded {file_type} file: {filename}")
return data
else:
raise ValueError(f"Unsupported file type: {file_type}")
def build_record_from_config(source_data: dict, expected_columns: list, additional_data: dict = None) -> dict:
"""
Build a record using the structure defined in the extract config.
Args:
source_data: The source data dictionary
expected_columns: List of column definitions from config
additional_data: Optional additional data to merge (e.g., parent UUID)
Returns:
Dictionary representing a single database record
"""
if additional_data is None:
additional_data = {}
# Merge source data with additional data (like uuid from parent structure)
combined_data = {**source_data, **additional_data}
record = {}
for column in expected_columns:
col_name = column['name']
# Skip auto-increment columns (like 'id')
if column.get('auto_increment', False):
continue
# Get value from combined data, use empty string as default
record[col_name] = combined_data.get(col_name, '')
return record
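# Illustrative use (hypothetical column config, not copied from ccr_extract_config):
#   expected_columns = [{'name': 'id', 'auto_increment': True}, {'name': 'uuid'}, {'name': 'name'}]
#   build_record_from_config({'name': 'Foo'}, expected_columns, {'uuid': 'abc-123'})
#   -> {'uuid': 'abc-123', 'name': 'Foo'}   # 'id' is skipped; missing values default to ''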
def create_db_engine(db: dict) -> Engine:
"""Create and test a database engine connection."""
db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
engine = create_engine(db_url)
# Test connection
conn = engine.connect()
conn.close()
print(f"🔌 Connected to database: {db['host']}:{db['port']}/{db['dbname']}")
return engine
def get_db_engine() -> Engine:
"""Get a database engine using the configured resource."""
db = wmill.client.get_resource(DB_RESOURCE_PATH)
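    # Assumption: the Windmill resource at DB_RESOURCE_PATH provides 'password',
    # 'host', 'port', and 'dbname'; note that create_db_engine hardcodes the username 'postgres'.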
return create_db_engine(db)
def generic_preprocess(
data: Union[dict, list],
expected_columns: list,
config: dict
) -> list:
"""
Generic data preprocessing function that handles various data structures.
Args:
data: Source data (dict or list)
expected_columns: List of column definitions
config: Preprocessing configuration
Returns:
List of processed records
"""
# Step 1: Follow data path
data_path = config.get("data_path", [])
for key in data_path:
if not isinstance(data, dict):
raise ValueError(f"Expected dict while navigating path, got {type(data)} at key '{key}'")
data = data.get(key)
if data is None:
raise ValueError(f"Missing key '{key}' in data path: {data_path}")
# Step 2: Handle nested structure
nested = config.get("nested", False)
nested_key = config.get("nested_key", None)
id_key = config.get("id_key", None)
flatten = config.get("flatten", False)
records = []
if isinstance(data, dict):
items = data.items()
elif isinstance(data, list):
items = enumerate(data)
else:
raise ValueError(f"Unsupported data structure: {type(data)}")
for outer_key, outer_value in items:
if nested:
if not isinstance(outer_value, list):
continue
for inner_value in outer_value:
if id_key and not inner_value.get(id_key):
continue
additional_data = {nested_key: outer_key} if nested_key else {}
record = build_record_from_config(inner_value, expected_columns, additional_data)
records.append(record)
else:
if not isinstance(outer_value, dict):
continue
if id_key and not outer_value.get(id_key):
continue
if flatten:
nested_data = outer_value.get("identifiers", {})
combined = {**nested_data, "uuid": outer_value.get("uuid")}
record = build_record_from_config(combined, expected_columns)
else:
record = build_record_from_config(outer_value, expected_columns)
records.append(record)
print(f"🔄 Processed {len(records)} records")
return records
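# Illustrative preprocess_config values (hypothetical, not copied from the extract config):
#   {"data_path": ["data"], "nested": True, "nested_key": "uuid", "id_key": "name"}
# walks into data["data"], treats each key as the parent uuid, and emits one record per
# list element; with "flatten": True instead, each outer dict's "identifiers" sub-dict is
# merged with its "uuid" before the record is built.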
def control_batch(data: list, batch_size: int = DEFAULT_BATCH_SIZE):
"""Split data into batches for processing."""
for i in range(0, len(data), batch_size):
yield data[i:i+batch_size]
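# e.g. list(control_batch(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]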
def insert_data_into_table_batch(records: list, table: str, engine: Engine, batch_size: int = DEFAULT_BATCH_SIZE) -> None:
"""Insert records into database table in batches."""
if not records:
print("⚠️ No records to insert, skipping database operation")
return
print(f"💾 Inserting {len(records)} records into {table} (batch size: {batch_size})")
# Get column names from first record
columns = list(records[0].keys())
column_names = ', '.join(f'"{col}"' for col in columns)
placeholders = ', '.join([f':{col}' for col in columns])
insert_sql = f"INSERT INTO {table} ({column_names}) VALUES ({placeholders})"
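    # e.g. for a hypothetical table "sets" with columns ["code", "name"] this yields:
    #   INSERT INTO sets ("code", "name") VALUES (:code, :name)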
with engine.connect() as conn:
batch_count = 0
total_inserted = 0
for batch in control_batch(records, batch_size):
batch_count += 1
batch_size_actual = len(batch)
conn.execute(text(insert_sql), batch)
total_inserted += batch_size_actual
if batch_count % 10 == 0:
print(f"⏳ Inserted {total_inserted}/{len(records)} records...")
conn.commit()
print(f"✅ Inserted {total_inserted} records in {batch_count} batches")
def process_job(job: dict) -> dict:
"""
Process a single ETL job.
Args:
job: Job configuration dictionary
Returns:
Dictionary with job processing results
"""
# Extract job parameters
url = job.get('url')
filename = job.get('filename')
outer_file_type = job.get('outer_file_type')
inner_file_type = job.get('inner_file_type')
table = job.get('table')
expected_columns = job.get('expected_columns')
batch_size = job.get('batch_size', DEFAULT_BATCH_SIZE)
preprocess_function_name = job.get('preprocess_function', 'generic_preprocess')
preprocess_config = job.get('preprocess_config')
active = job.get('active')
iterables = job.get('iterables')
print(f"\n🚀 Processing job for table '{table}'")
if not active:
print(f"⚠️ Job is not active, skipping")
return {"status": "skipped"}
# Get preprocessing function
    # Resolve the preprocessing function by name; a non-string value is treated as the callable itself.
    preprocess_function = (
        globals().get(preprocess_function_name)
        if isinstance(preprocess_function_name, str)
        else preprocess_function_name
    )
    if not callable(preprocess_function):
        raise ValueError(f"Preprocessing function '{preprocess_function_name}' not found or not callable.")
# Get database engine
engine = get_db_engine()
# Populate download queue
if iterables:
queue = generate_download_queue(url, filename, outer_file_type, iterables)
else:
queue = [{
'url': url,
'filename': filename,
'outer_file_type': outer_file_type,
'inner_file_type': inner_file_type,
'table': table,
'expected_columns': expected_columns
}]
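    # Only 'url', 'filename', and 'outer_file_type' are read back from each queue item
    # below; the remaining keys are carried along but not consumed by this loop.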
# Process download queue
for queue_item in queue:
content = download_request(queue_item.get('url'), queue_item.get('filename'), queue_item.get('outer_file_type'))
save_file(content, queue_item.get('filename'))
# Handle file extraction if needed
saved_filename = filename
if outer_file_type == 'zip':
saved_filename = unzip_file(filename)
# Load and preprocess data
data = load_file(saved_filename, inner_file_type)
records = preprocess_function(data, expected_columns, preprocess_config)
# Insert data into database
insert_data_into_table_batch(records, table, engine, batch_size)
result = {
"status": "success",
"table": table,
"records_processed": len(records),
"filename": saved_filename
}
print(f"✅ Job complete: {len(records)} records processed for {table}")
return result
def main() -> dict:
"""
Main ETL processing function.
Returns:
Dictionary with overall processing results
"""
print("🎯 ETL Process Starting")
print("=" * 50)
# Load configuration
config_yaml = wmill.get_variable(EXTRACT_CONFIG_PATH)
config = yaml.safe_load(config_yaml)
print(f"📋 Processing {len(config['jobs'])} jobs")
results = []
successful_jobs = 0
failed_jobs = 0
for i, job in enumerate(config['jobs'], 1):
print(f"\n--- Job {i}/{len(config['jobs'])} ---")
try:
result = process_job(job)
results.append(result)
successful_jobs += 1
except Exception as e:
error_result = {
"status": "error",
"table": job.get('table', 'unknown'),
"error": str(e),
"filename": job.get('filename', 'unknown')
}
results.append(error_result)
failed_jobs += 1
print(f"❌ Job {i} failed: {str(e)}")
print(f"\n🏁 ETL Process Complete")
print(f"✅ Successful: {successful_jobs} | ❌ Failed: {failed_jobs} | 📋 Total: {len(results)}")
return {
"status": "completed",
"jobs_processed": len(results),
"successful_jobs": successful_jobs,
"failed_jobs": failed_jobs,
"results": results
}
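# Local entry point (assumption: Windmill invokes main() directly, so this guard only
# matters when running the script outside the platform, e.g. for ad-hoc testing).
if __name__ == "__main__":
    main()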