commit
f/CCR_ETL/ccr_etl_mtgjson.py · 377 lines · Normal file
@@ -0,0 +1,377 @@
"""
CCR ETL MTGJSON Processing Script

This script handles the extraction, transformation, and loading of MTGJSON data
into a PostgreSQL database. It supports downloading, unzipping, preprocessing,
and batch inserting of various data formats.
"""

import json
import os
import yaml
from typing import Union
from zipfile import ZipFile

import psycopg2  # DBAPI driver required by the postgresql+psycopg2 SQLAlchemy URL below
import requests
import wmill
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine

# Configuration paths
DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'
EXTRACT_CONFIG_PATH = 'f/CCR_ETL/ccr_extract_config'
DOWNLOAD_CONFIG_PATH = './shared/'

# Default processing parameters
DEFAULT_BATCH_SIZE = 1000


def validate_response_headers(response: requests.Response, outer_file_type: str) -> None:
    """Validate that the response content type matches the expected file type."""
    if response.headers['Content-Type'] != f'application/{outer_file_type}':
        raise ValueError(f"Expected {outer_file_type} response, got {response.headers['Content-Type']}")


def download_request(url: str, filename: str, outer_file_type: str) -> bytes:
    """Download a file from the given URL and validate its content type."""
    print(f"🔽 Downloading {filename} from {url}")

    response = requests.get(url)
    response.raise_for_status()
    validate_response_headers(response, outer_file_type)

    print(f"✅ Download successful ({response.headers.get('Content-Length', 'Unknown')} bytes)")
    return response.content
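
# Illustration only (URL and filename are hypothetical, not taken from the real config):
# the Content-Type check means a job with outer_file_type='zip' expects 'application/zip',
# and a plain JSON endpoint would use outer_file_type='json' ('application/json').
#
#   content = download_request(
#       'https://mtgjson.com/api/v5/AllPrintings.json.zip',  # hypothetical source URL
#       'AllPrintings.json.zip',
#       'zip',
#   )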


def generate_download_queue(url: str, filename: str, outer_file_type: str, iterables: dict) -> list:
    """
    Generate a queue of download items based on URL templates and iterable values.

    Example:
        url = "https://tcgcsv.com/tcgplayer/{game_id}/groups"
        iterables = {'game_id': [1, 3, 65, 71, 86]}
    """
    queue = []
    for key, value in iterables.items():
        for item in value:
            # Substitute the named placeholder (e.g. '{game_id}') with each item
            queue_item = {
                'url': url.format(**{key: item}),
                'filename': filename.format(**{key: item}),
                'outer_file_type': outer_file_type,
            }
            queue.append(queue_item)
    return queue
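
# With the docstring's example inputs, the queue expands to one item per game_id
# (the filename template below is hypothetical):
#
#   generate_download_queue(
#       'https://tcgcsv.com/tcgplayer/{game_id}/groups',
#       'groups_{game_id}.json',
#       'json',
#       {'game_id': [1, 3, 65, 71, 86]},
#   )
#   # -> [{'url': 'https://tcgcsv.com/tcgplayer/1/groups',
#   #      'filename': 'groups_1.json', 'outer_file_type': 'json'}, ...]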


def save_file(content: bytes, filename: str) -> None:
    """Save binary content to a file in the download directory."""
    filepath = DOWNLOAD_CONFIG_PATH + filename
    with open(filepath, 'wb') as f:
        f.write(content)
    print(f"💾 Saved {len(content)} bytes to {filename}")


def unzip_file(filename: str) -> str:
    """Extract a zip file and return the name of the extracted content."""
    new_filename = filename.replace('.zip', '')
    zip_path = DOWNLOAD_CONFIG_PATH + filename

    with ZipFile(zip_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        print(f"📦 Extracting {len(file_list)} files from {filename}")
        zip_ref.extractall(DOWNLOAD_CONFIG_PATH)

    return new_filename


def load_file(filename: str, file_type: str) -> Union[dict, list]:
    """Load and parse a file from the download directory."""
    filepath = DOWNLOAD_CONFIG_PATH + filename

    if file_type == 'json':
        with open(filepath, 'r') as f:
            data = json.load(f)
        print(f"📖 Loaded {file_type} file: {filename}")
        return data
    else:
        raise ValueError(f"Unsupported file type: {file_type}")
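
# Sketch of the download/extract/load flow these helpers implement (the filename is
# hypothetical; unzip_file assumes the archive yields a file of the same name minus
# the '.zip' suffix):
#
#   save_file(content, 'AllPrintings.json.zip')
#   inner_name = unzip_file('AllPrintings.json.zip')   # -> 'AllPrintings.json'
#   data = load_file(inner_name, 'json')               # -> parsed dict/list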


def build_record_from_config(source_data: dict, expected_columns: list, additional_data: dict = None) -> dict:
    """
    Build a record using the structure defined in the extract config.

    Args:
        source_data: The source data dictionary
        expected_columns: List of column definitions from config
        additional_data: Optional additional data to merge (e.g., parent UUID)

    Returns:
        Dictionary representing a single database record
    """
    if additional_data is None:
        additional_data = {}

    # Merge source data with additional data (like uuid from parent structure)
    combined_data = {**source_data, **additional_data}

    record = {}
    for column in expected_columns:
        col_name = column['name']
        # Skip auto-increment columns (like 'id')
        if column.get('auto_increment', False):
            continue
        # Get value from combined data, use empty string as default
        record[col_name] = combined_data.get(col_name, '')

    return record
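
# Illustration (column and field names are hypothetical, but the shape matches what the
# loop above expects: each column definition needs a 'name' and may set 'auto_increment'):
#
#   expected_columns = [
#       {'name': 'id', 'auto_increment': True},
#       {'name': 'uuid'},
#       {'name': 'name'},
#   ]
#   build_record_from_config({'name': 'Black Lotus'}, expected_columns, {'uuid': 'abc-123'})
#   # -> {'uuid': 'abc-123', 'name': 'Black Lotus'}   ('id' skipped; missing keys -> '')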


def create_db_engine(db: dict) -> Engine:
    """Create and test a database engine connection."""
    db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = create_engine(db_url)

    # Test connection
    conn = engine.connect()
    conn.close()
    print(f"🔌 Connected to database: {db['host']}:{db['port']}/{db['dbname']}")

    return engine


def get_db_engine() -> Engine:
    """Get a database engine using the configured resource."""
    db = wmill.client.get_resource(DB_RESOURCE_PATH)
    return create_db_engine(db)
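
# The Windmill resource behind DB_RESOURCE_PATH is expected to provide at least the keys
# read above (values below are placeholders, not real credentials; the username is
# hard-coded to 'postgres' in the URL):
#
#   db = {'host': 'localhost', 'port': 5432, 'dbname': 'ccr', 'password': '...'}
#   engine = create_db_engine(db)
#   # -> postgresql+psycopg2://postgres:...@localhost:5432/ccr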


def generic_preprocess(
    data: Union[dict, list],
    expected_columns: list,
    config: dict
) -> list:
    """
    Generic data preprocessing function that handles various data structures.

    Args:
        data: Source data (dict or list)
        expected_columns: List of column definitions
        config: Preprocessing configuration

    Returns:
        List of processed records
    """
    # Step 1: Follow data path
    data_path = config.get("data_path", [])
    for key in data_path:
        if not isinstance(data, dict):
            raise ValueError(f"Expected dict while navigating path, got {type(data)} at key '{key}'")
        data = data.get(key)
        if data is None:
            raise ValueError(f"Missing key '{key}' in data path: {data_path}")

    # Step 2: Handle nested structure
    nested = config.get("nested", False)
    nested_key = config.get("nested_key", None)
    id_key = config.get("id_key", None)
    flatten = config.get("flatten", False)

    records = []

    if isinstance(data, dict):
        items = data.items()
    elif isinstance(data, list):
        items = enumerate(data)
    else:
        raise ValueError(f"Unsupported data structure: {type(data)}")

    for outer_key, outer_value in items:
        if nested:
            if not isinstance(outer_value, list):
                continue
            for inner_value in outer_value:
                if id_key and not inner_value.get(id_key):
                    continue
                additional_data = {nested_key: outer_key} if nested_key else {}
                record = build_record_from_config(inner_value, expected_columns, additional_data)
                records.append(record)
        else:
            if not isinstance(outer_value, dict):
                continue
            if id_key and not outer_value.get(id_key):
                continue
            if flatten:
                nested_data = outer_value.get("identifiers", {})
                combined = {**nested_data, "uuid": outer_value.get("uuid")}
                record = build_record_from_config(combined, expected_columns)
            else:
                record = build_record_from_config(outer_value, expected_columns)
            records.append(record)

    print(f"🔄 Processed {len(records)} records")
    return records
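
# A minimal sketch of a preprocess_config for the nested case (keys and sample data are
# illustrative, not taken from the real extract config). With MTGJSON-style data keyed by
# uuid, where each value is a list of child rows:
#
#   data = {'data': {'uuid-1': [{'price': 1.23}, {'price': 4.56}]}}
#   config = {'data_path': ['data'], 'nested': True, 'nested_key': 'uuid', 'id_key': 'price'}
#   expected_columns = [{'name': 'uuid'}, {'name': 'price'}]
#   generic_preprocess(data, expected_columns, config)
#   # -> [{'uuid': 'uuid-1', 'price': 1.23}, {'uuid': 'uuid-1', 'price': 4.56}]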


def control_batch(data: list, batch_size: int = DEFAULT_BATCH_SIZE):
    """Split data into batches for processing."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]


def insert_data_into_table_batch(records: list, table: str, engine: Engine, batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """Insert records into database table in batches."""
    if not records:
        print("⚠️ No records to insert, skipping database operation")
        return

    print(f"💾 Inserting {len(records)} records into {table} (batch size: {batch_size})")

    # Get column names from first record
    columns = list(records[0].keys())
    column_names = ', '.join(f'"{col}"' for col in columns)
    placeholders = ', '.join(f':{col}' for col in columns)

    insert_sql = f"INSERT INTO {table} ({column_names}) VALUES ({placeholders})"

    with engine.connect() as conn:
        batch_count = 0
        total_inserted = 0

        for batch in control_batch(records, batch_size):
            batch_count += 1
            batch_size_actual = len(batch)

            conn.execute(text(insert_sql), batch)
            total_inserted += batch_size_actual

            if batch_count % 10 == 0:
                print(f"⏳ Inserted {total_inserted}/{len(records)} records...")

        conn.commit()
        print(f"✅ Inserted {total_inserted} records in {batch_count} batches")
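
# For a record shaped like the build_record_from_config example above, the generated
# statement is a named-parameter executemany (the table name 'cards' is illustrative):
#
#   records = [{'uuid': 'abc-123', 'name': 'Black Lotus'}]
#   # insert_sql: INSERT INTO cards ("uuid", "name") VALUES (:uuid, :name)
#   # conn.execute(text(insert_sql), batch) binds each dict in the batch to the
#   # named :uuid / :name parameters.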


def process_job(job: dict) -> dict:
    """
    Process a single ETL job.

    Args:
        job: Job configuration dictionary

    Returns:
        Dictionary with job processing results
    """
    # Extract job parameters
    url = job.get('url')
    filename = job.get('filename')
    outer_file_type = job.get('outer_file_type')
    inner_file_type = job.get('inner_file_type')
    table = job.get('table')
    expected_columns = job.get('expected_columns')
    batch_size = job.get('batch_size', DEFAULT_BATCH_SIZE)
    preprocess_function_name = job.get('preprocess_function', 'generic_preprocess')
    preprocess_config = job.get('preprocess_config')
    active = job.get('active')
    iterables = job.get('iterables')

    print(f"\n🚀 Processing job for table '{table}'")

    if not active:
        print("⚠️ Job is not active, skipping")
        return {"status": "skipped"}

    # Get preprocessing function (looked up by name in this module's globals)
    preprocess_function = None
    if isinstance(preprocess_function_name, str):
        preprocess_function = globals().get(preprocess_function_name)
    if not callable(preprocess_function):
        raise ValueError(f"Preprocessing function '{preprocess_function_name}' not found or not callable.")

    # Get database engine
    engine = get_db_engine()

    # Populate download queue
    if iterables:
        queue = generate_download_queue(url, filename, outer_file_type, iterables)
    else:
        queue = [{
            'url': url,
            'filename': filename,
            'outer_file_type': outer_file_type,
            'inner_file_type': inner_file_type,
            'table': table,
            'expected_columns': expected_columns
        }]

    # Process download queue
    for queue_item in queue:
        content = download_request(queue_item.get('url'), queue_item.get('filename'), queue_item.get('outer_file_type'))
        save_file(content, queue_item.get('filename'))

    # Handle file extraction if needed (this step and the load below operate on the
    # job-level `filename`, not on per-item filenames generated from `iterables`)
    saved_filename = filename
    if outer_file_type == 'zip':
        saved_filename = unzip_file(filename)

    # Load and preprocess data
    data = load_file(saved_filename, inner_file_type)
    records = preprocess_function(data, expected_columns, preprocess_config)

    # Insert data into database
    insert_data_into_table_batch(records, table, engine, batch_size)

    result = {
        "status": "success",
        "table": table,
        "records_processed": len(records),
        "filename": saved_filename
    }

    print(f"✅ Job complete: {len(records)} records processed for {table}")
    return result
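
# A hypothetical job entry as process_job receives it after YAML parsing (values are
# placeholders; only the keys are taken from the code above):
#
#   job = {
#       'active': True,
#       'url': 'https://mtgjson.com/api/v5/AllPrintings.json.zip',
#       'filename': 'AllPrintings.json.zip',
#       'outer_file_type': 'zip',
#       'inner_file_type': 'json',
#       'table': 'cards',
#       'expected_columns': [{'name': 'uuid'}, {'name': 'name'}],
#       'preprocess_function': 'generic_preprocess',
#       'preprocess_config': {'data_path': ['data']},
#   }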


def main() -> dict:
    """
    Main ETL processing function.

    Returns:
        Dictionary with overall processing results
    """
    print("🎯 ETL Process Starting")
    print("=" * 50)

    # Load configuration
    config_yaml = wmill.get_variable(EXTRACT_CONFIG_PATH)
    config = yaml.safe_load(config_yaml)
    print(f"📋 Processing {len(config['jobs'])} jobs")

    results = []
    successful_jobs = 0
    failed_jobs = 0

    for i, job in enumerate(config['jobs'], 1):
        print(f"\n--- Job {i}/{len(config['jobs'])} ---")

        try:
            result = process_job(job)
            results.append(result)
            successful_jobs += 1
        except Exception as e:
            error_result = {
                "status": "error",
                "table": job.get('table', 'unknown'),
                "error": str(e),
                "filename": job.get('filename', 'unknown')
            }
            results.append(error_result)
            failed_jobs += 1
            print(f"❌ Job {i} failed: {str(e)}")

    print("\n🏁 ETL Process Complete")
    print(f"✅ Successful: {successful_jobs} | ❌ Failed: {failed_jobs} | 📋 Total: {len(results)}")

    return {
        "status": "completed",
        "jobs_processed": len(results),
        "successful_jobs": successful_jobs,
        "failed_jobs": failed_jobs,
        "results": results
    }
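
# For reference, the Windmill variable at EXTRACT_CONFIG_PATH is expected to be YAML with
# a top-level 'jobs' list, each entry shaped like the job dict sketched above (this is an
# illustrative outline, not the actual config):
#
#   jobs:
#     - active: true
#       url: https://mtgjson.com/api/v5/AllPrintings.json.zip
#       filename: AllPrintings.json.zip
#       outer_file_type: zip
#       inner_file_type: json
#       table: cards
#       expected_columns:
#         - name: uuid
#         - name: name
#       preprocess_config:
#         data_path: [data]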