2025-09-09 12:43:38 -04:00
parent 698ec83c96
commit a73ec73921
14 changed files with 2646 additions and 0 deletions

@@ -0,0 +1,377 @@
"""
CCR ETL MTGJSON Processing Script
This script handles the extraction, transformation, and loading of MTGJSON data
into a PostgreSQL database. It supports downloading, unzipping, preprocessing,
and batch inserting of various data formats.
"""
import json
import os
import yaml
from typing import Union
from zipfile import ZipFile
import psycopg2
import requests
import wmill
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
# Configuration paths
DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'
EXTRACT_CONFIG_PATH = 'f/CCR_ETL/ccr_extract_config'
DOWNLOAD_CONFIG_PATH = './shared/'
# Default processing parameters
DEFAULT_BATCH_SIZE = 1000
def validate_response_headers(response: requests.Response, outer_file_type: str) -> None:
"""Validate that the response content type matches the expected file type."""
    # Compare against the media-type prefix so parameters like "; charset=utf-8" or a missing header do not cause a KeyError or false failure.
    content_type = response.headers.get('Content-Type', '')
    if not content_type.startswith(f'application/{outer_file_type}'):
        raise ValueError(f"Expected application/{outer_file_type} response, got '{content_type}'")
def download_request(url: str, filename: str, outer_file_type: str) -> bytes:
"""Download a file from the given URL and validate its content type."""
print(f"🔽 Downloading {filename} from {url}")
response = requests.get(url)
response.raise_for_status()
validate_response_headers(response, outer_file_type)
print(f"✅ Download successful ({response.headers.get('Content-Length', 'Unknown')} bytes)")
return response.content
def generate_download_queue(url: str, filename: str, outer_file_type: str, iterables: dict) -> list:
"""
Generate a queue of download items based on URL templates and iterable values.
Example:
url = "https://tcgcsv.com/tcgplayer/{game_id}/groups"
iterables = {'game_id': [1,3,65,71,86]}
"""
queue = []
for key, value in iterables.items():
for item in value:
queue_item = {
                'url': url.format(**{key: item}),  # substitute the iterable key into the template, e.g. {game_id} -> 1
                'filename': filename.format(**{key: item}),
'outer_file_type': outer_file_type,
}
queue.append(queue_item)
return queue
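# Illustrative expansion (hypothetical filename template, not taken from the config):
# with url = "https://tcgcsv.com/tcgplayer/{game_id}/groups",
# filename = "groups_{game_id}.json", and iterables = {'game_id': [1, 3]},
# the queue would contain:
#   {'url': 'https://tcgcsv.com/tcgplayer/1/groups', 'filename': 'groups_1.json', 'outer_file_type': 'json'}
#   {'url': 'https://tcgcsv.com/tcgplayer/3/groups', 'filename': 'groups_3.json', 'outer_file_type': 'json'}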
def save_file(content: bytes, filename: str) -> None:
"""Save binary content to a file in the download directory."""
filepath = DOWNLOAD_CONFIG_PATH + filename
with open(filepath, 'wb') as f:
f.write(content)
print(f"💾 Saved {len(content)} bytes to {filename}")
def unzip_file(filename: str) -> str:
"""Extract a zip file and return the name of the extracted content."""
new_filename = filename.replace('.zip', '')
zip_path = DOWNLOAD_CONFIG_PATH + filename
with ZipFile(zip_path, 'r') as zip_ref:
file_list = zip_ref.namelist()
print(f"📦 Extracting {len(file_list)} files from {filename}")
zip_ref.extractall(DOWNLOAD_CONFIG_PATH)
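    # Note: the returned name assumes the archive's payload is named like the zip
    # minus its ".zip" suffix; the names in file_list are not checked against this.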
return new_filename
def load_file(filename: str, file_type: str) -> Union[dict, list]:
"""Load and parse a file from the download directory."""
filepath = DOWNLOAD_CONFIG_PATH + filename
if file_type == 'json':
with open(filepath, 'r') as f:
data = json.load(f)
print(f"📖 Loaded {file_type} file: {filename}")
return data
else:
raise ValueError(f"Unsupported file type: {file_type}")
def build_record_from_config(source_data: dict, expected_columns: list, additional_data: dict = None) -> dict:
"""
Build a record using the structure defined in the extract config.
Args:
source_data: The source data dictionary
expected_columns: List of column definitions from config
additional_data: Optional additional data to merge (e.g., parent UUID)
Returns:
Dictionary representing a single database record
"""
if additional_data is None:
additional_data = {}
# Merge source data with additional data (like uuid from parent structure)
combined_data = {**source_data, **additional_data}
record = {}
for column in expected_columns:
col_name = column['name']
# Skip auto-increment columns (like 'id')
if column.get('auto_increment', False):
continue
# Get value from combined data, use empty string as default
record[col_name] = combined_data.get(col_name, '')
return record
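# Illustrative use (hypothetical column config, not copied from ccr_extract_config):
#   expected_columns = [{'name': 'id', 'auto_increment': True}, {'name': 'uuid'}, {'name': 'name'}]
#   build_record_from_config({'name': 'Foo'}, expected_columns, {'uuid': 'abc-123'})
#   -> {'uuid': 'abc-123', 'name': 'Foo'}   # 'id' is skipped; missing values default to ''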
def create_db_engine(db: dict) -> Engine:
"""Create and test a database engine connection."""
db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
engine = create_engine(db_url)
# Test connection
conn = engine.connect()
conn.close()
print(f"🔌 Connected to database: {db['host']}:{db['port']}/{db['dbname']}")
return engine
def get_db_engine() -> Engine:
"""Get a database engine using the configured resource."""
db = wmill.client.get_resource(DB_RESOURCE_PATH)
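    # Assumption: the Windmill resource at DB_RESOURCE_PATH provides 'password',
    # 'host', 'port', and 'dbname'; note that create_db_engine hardcodes the username 'postgres'.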
return create_db_engine(db)
def generic_preprocess(
data: Union[dict, list],
expected_columns: list,
config: dict
) -> list:
"""
Generic data preprocessing function that handles various data structures.
Args:
data: Source data (dict or list)
expected_columns: List of column definitions
config: Preprocessing configuration
Returns:
List of processed records
"""
# Step 1: Follow data path
data_path = config.get("data_path", [])
for key in data_path:
if not isinstance(data, dict):
raise ValueError(f"Expected dict while navigating path, got {type(data)} at key '{key}'")
data = data.get(key)
if data is None:
raise ValueError(f"Missing key '{key}' in data path: {data_path}")
# Step 2: Handle nested structure
nested = config.get("nested", False)
nested_key = config.get("nested_key", None)
id_key = config.get("id_key", None)
flatten = config.get("flatten", False)
records = []
if isinstance(data, dict):
items = data.items()
elif isinstance(data, list):
items = enumerate(data)
else:
raise ValueError(f"Unsupported data structure: {type(data)}")
for outer_key, outer_value in items:
if nested:
if not isinstance(outer_value, list):
continue
for inner_value in outer_value:
if id_key and not inner_value.get(id_key):
continue
additional_data = {nested_key: outer_key} if nested_key else {}
record = build_record_from_config(inner_value, expected_columns, additional_data)
records.append(record)
else:
if not isinstance(outer_value, dict):
continue
if id_key and not outer_value.get(id_key):
continue
if flatten:
nested_data = outer_value.get("identifiers", {})
combined = {**nested_data, "uuid": outer_value.get("uuid")}
record = build_record_from_config(combined, expected_columns)
else:
record = build_record_from_config(outer_value, expected_columns)
records.append(record)
print(f"🔄 Processed {len(records)} records")
return records
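# Illustrative preprocess_config values (hypothetical, not copied from the extract config):
#   {"data_path": ["data"], "nested": True, "nested_key": "uuid", "id_key": "name"}
# walks into data["data"], treats each key as the parent uuid, and emits one record per
# list element; with "flatten": True instead, each outer dict's "identifiers" sub-dict is
# merged with its "uuid" before the record is built.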
def control_batch(data: list, batch_size: int = DEFAULT_BATCH_SIZE):
"""Split data into batches for processing."""
for i in range(0, len(data), batch_size):
yield data[i:i+batch_size]
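# e.g. list(control_batch(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]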
def insert_data_into_table_batch(records: list, table: str, engine: Engine, batch_size: int = DEFAULT_BATCH_SIZE) -> None:
"""Insert records into database table in batches."""
if not records:
print("⚠️ No records to insert, skipping database operation")
return
print(f"💾 Inserting {len(records)} records into {table} (batch size: {batch_size})")
# Get column names from first record
columns = list(records[0].keys())
column_names = ', '.join(f'"{col}"' for col in columns)
placeholders = ', '.join([f':{col}' for col in columns])
insert_sql = f"INSERT INTO {table} ({column_names}) VALUES ({placeholders})"
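    # e.g. for a hypothetical table "sets" with columns ["code", "name"] this yields:
    #   INSERT INTO sets ("code", "name") VALUES (:code, :name)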
with engine.connect() as conn:
batch_count = 0
total_inserted = 0
for batch in control_batch(records, batch_size):
batch_count += 1
batch_size_actual = len(batch)
conn.execute(text(insert_sql), batch)
total_inserted += batch_size_actual
if batch_count % 10 == 0:
print(f"⏳ Inserted {total_inserted}/{len(records)} records...")
conn.commit()
print(f"✅ Inserted {total_inserted} records in {batch_count} batches")
def process_job(job: dict) -> dict:
"""
Process a single ETL job.
Args:
job: Job configuration dictionary
Returns:
Dictionary with job processing results
"""
# Extract job parameters
url = job.get('url')
filename = job.get('filename')
outer_file_type = job.get('outer_file_type')
inner_file_type = job.get('inner_file_type')
table = job.get('table')
expected_columns = job.get('expected_columns')
batch_size = job.get('batch_size', DEFAULT_BATCH_SIZE)
preprocess_function_name = job.get('preprocess_function', 'generic_preprocess')
preprocess_config = job.get('preprocess_config')
active = job.get('active')
iterables = job.get('iterables')
print(f"\n🚀 Processing job for table '{table}'")
if not active:
print(f"⚠️ Job is not active, skipping")
return {"status": "skipped"}
# Get preprocessing function
    # Resolve the preprocessing function by name; a non-string value is treated as the callable itself.
    preprocess_function = (
        globals().get(preprocess_function_name)
        if isinstance(preprocess_function_name, str)
        else preprocess_function_name
    )
    if not callable(preprocess_function):
        raise ValueError(f"Preprocessing function '{preprocess_function_name}' not found or not callable.")
# Get database engine
engine = get_db_engine()
# Populate download queue
if iterables:
queue = generate_download_queue(url, filename, outer_file_type, iterables)
else:
queue = [{
'url': url,
'filename': filename,
'outer_file_type': outer_file_type,
'inner_file_type': inner_file_type,
'table': table,
'expected_columns': expected_columns
}]
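    # Only 'url', 'filename', and 'outer_file_type' are read back from each queue item
    # below; the remaining keys are carried along but not consumed by this loop.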
# Process download queue
for queue_item in queue:
content = download_request(queue_item.get('url'), queue_item.get('filename'), queue_item.get('outer_file_type'))
save_file(content, queue_item.get('filename'))
# Handle file extraction if needed
saved_filename = filename
if outer_file_type == 'zip':
saved_filename = unzip_file(filename)
# Load and preprocess data
data = load_file(saved_filename, inner_file_type)
records = preprocess_function(data, expected_columns, preprocess_config)
# Insert data into database
insert_data_into_table_batch(records, table, engine, batch_size)
result = {
"status": "success",
"table": table,
"records_processed": len(records),
"filename": saved_filename
}
print(f"✅ Job complete: {len(records)} records processed for {table}")
return result
def main() -> dict:
"""
Main ETL processing function.
Returns:
Dictionary with overall processing results
"""
print("🎯 ETL Process Starting")
print("=" * 50)
# Load configuration
config_yaml = wmill.get_variable(EXTRACT_CONFIG_PATH)
config = yaml.safe_load(config_yaml)
print(f"📋 Processing {len(config['jobs'])} jobs")
results = []
successful_jobs = 0
failed_jobs = 0
for i, job in enumerate(config['jobs'], 1):
print(f"\n--- Job {i}/{len(config['jobs'])} ---")
try:
result = process_job(job)
results.append(result)
successful_jobs += 1
except Exception as e:
error_result = {
"status": "error",
"table": job.get('table', 'unknown'),
"error": str(e),
"filename": job.get('filename', 'unknown')
}
results.append(error_result)
failed_jobs += 1
print(f"❌ Job {i} failed: {str(e)}")
print(f"\n🏁 ETL Process Complete")
print(f"✅ Successful: {successful_jobs} | ❌ Failed: {failed_jobs} | 📋 Total: {len(results)}")
return {
"status": "completed",
"jobs_processed": len(results),
"successful_jobs": successful_jobs,
"failed_jobs": failed_jobs,
"results": results
}
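# Local entry point (assumption: Windmill invokes main() directly, so this guard only
# matters when running the script outside the platform, e.g. for ad-hoc testing).
if __name__ == "__main__":
    main()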