2025-09-09 12:43:38 -04:00
parent 698ec83c96
commit a73ec73921
14 changed files with 2646 additions and 0 deletions

View File

@@ -0,0 +1,189 @@
description: ''
value: |-
  # Global table strategy (can be overridden per table)
  table_strategy: drop_and_recreate
  schema:
    name: ccr_etl_raw
    version: 1.0.0
    description: CCR ETL Raw Data Schema
    tables:
      - name: mtgjson_skus
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: uuid
            type: string
            description: The UUID of the MTGJSON SKU
          - name: condition
            type: string
            description: The condition of the MTGJSON SKU
          - name: language
            type: string
            description: The language of the MTGJSON SKU
          - name: printing
            type: string
            description: The printing of the MTGJSON SKU
          - name: finish
            type: string
            description: The finish of the MTGJSON SKU
          - name: productId
            type: string
            description: The TCGplayer product ID of the MTGJSON SKU
          - name: skuId
            type: string
            description: The TCGplayer SKU ID of the MTGJSON SKU
      - name: mtgjson_identifiers
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: uuid
            type: string
            description: The UUID of the MTGJSON identifier
          - name: name
            type: string
            description: The name of the MTGJSON identifier
          - name: setCode
            type: string
            description: The set code of the MTGJSON identifier
          - name: abuId
            type: string
            description: The ABU Games ID
          - name: cardKingdomEtchedId
            type: string
            description: The Card Kingdom etched ID
          - name: cardKingdomFoilId
            type: string
            description: The Card Kingdom foil ID
          - name: cardKingdomId
            type: string
            description: The Card Kingdom ID
          - name: cardsphereId
            type: string
            description: The Cardsphere ID
          - name: cardsphereFoilId
            type: string
            description: The Cardsphere foil ID
          - name: cardtraderId
            type: string
            description: The Cardtrader ID
          - name: csiId
            type: string
            description: The Cool Stuff Inc ID
          - name: mcmId
            type: string
            description: The Cardmarket ID
          - name: mcmMetaId
            type: string
            description: The Cardmarket meta ID
          - name: miniaturemarketId
            type: string
            description: The Miniature Market ID
          - name: mtgArenaId
            type: string
            description: The MTG Arena ID
          - name: mtgjsonFoilVersionId
            type: string
            description: The UUID generated by MTGJSON for the foil version of a card
          - name: mtgjsonNonFoilVersionId
            type: string
            description: The UUID generated by MTGJSON for the non-foil version of a card
          - name: mtgjsonV4Id
            type: string
            description: The UUID generated by MTGJSON v4 for a card
          - name: mtgoFoilId
            type: string
            description: The MTGO foil ID
          - name: mtgoId
            type: string
            description: The MTGO ID
          - name: multiverseId
            type: string
            description: The multiverse ID used by WotC for Gatherer
          - name: scgId
            type: string
            description: The Star City Games ID
          - name: scryfallId
            type: string
            description: The Scryfall ID
          - name: scryfallCardBackId
            type: string
            description: The Scryfall card back ID
          - name: scryfallOracleId
            type: string
            description: The Scryfall oracle ID
          - name: scryfallIllustrationId
            type: string
            description: The Scryfall illustration ID
          - name: tcgplayerProductId
            type: string
            description: The TCGplayer product ID
          - name: tcgplayerEtchedProductId
            type: string
            description: The TCGplayer etched product ID
          - name: tntId
            type: string
            description: The Troll and Toad ID
      - name: tcgcsv_categories
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: categoryId
            type: integer
          - name: name
            type: string
          - name: modifiedOn
            type: string
          - name: displayName
            type: string
          - name: seoCategoryName
            type: string
          - name: categoryDescription
            type: string
          - name: categoryPageTitle
            type: string
          - name: sealedLabel
            type: string
          - name: nonSealedLabel
            type: string
          - name: conditionGuideUrl
            type: string
          - name: isScannable
            type: boolean
          - name: popularity
            type: integer
          - name: isDirect
            type: boolean
      - name: tcgcsv_groups
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            primary_key: true
            autoincrement: true
          - name: groupId
            type: integer
          - name: name
            type: string
          - name: abbreviation
            type: string
          - name: isSupplemental
            type: boolean
          - name: publishedOn
            type: string
          - name: modifiedOn
            type: string
          - name: categoryId
            type: integer
is_secret: false
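
The comment at the top of the value block describes a two-level precedence: a table's own strategy wins, otherwise the global table_strategy applies. A minimal standalone sketch of that resolution (a hypothetical snippet, mirroring what the init script below does):

# Hypothetical sketch of the strategy precedence (not part of this commit).
config = {
    "table_strategy": "drop_and_recreate",  # global default
    "schema": {"tables": [
        {"name": "mtgjson_skus", "strategy": "drop_and_recreate"},  # explicit per-table override
        {"name": "uses_global_default"},                            # falls back to the global value
    ]},
}
for table in config["schema"]["tables"]:
    strategy = table.get("strategy", config.get("table_strategy", "drop_and_recreate"))
    print(f"{table['name']} -> {strategy}")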

View File

@@ -0,0 +1,89 @@
import wmill
import yaml
import psycopg2  # PostgreSQL driver used by SQLAlchemy's psycopg2 dialect
from sqlalchemy import create_engine, text, MetaData, Table, Column, Integer, String, Boolean, inspect
from sqlalchemy.engine import Engine

# You can import any PyPI package.
# See here for more info: https://www.windmill.dev/docs/advanced/dependencies_in_python
# You can use typed resources by doing a type alias to dict:
# postgresql = dict

DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'


def create_db_engine(db: dict) -> Engine:
    """Create a SQLAlchemy engine and verify that it can connect."""
    db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = create_engine(db_url)
    # Open and close a connection up front so a bad resource fails fast
    with engine.connect():
        pass
    return engine


def table_exists(engine: Engine, table_name: str) -> bool:
    """Check if a table exists in the database."""
    inspector = inspect(engine)
    return table_name in inspector.get_table_names()


def create_table(engine: Engine, table: dict, strategy: str = "create_if_not_exists"):
    """Create a table from its config definition, honoring the given strategy."""
    table_name = table['name']
    try:
        columns = table['columns']

        # Handle the different table strategies
        if strategy == "drop_and_recreate":
            if table_exists(engine, table_name):
                print(f"Dropping existing table: {table_name}")
                with engine.connect() as conn:
                    conn.execute(text(f'DROP TABLE IF EXISTS "{table_name}" CASCADE'))
                    conn.commit()
        elif strategy == "create_if_not_exists":
            if table_exists(engine, table_name):
                print(f"Table {table_name} already exists, skipping creation")
                return
        else:
            raise ValueError(f"Unknown table strategy: {strategy}")

        # Map config types to SQLAlchemy types
        # ('boolean' covers the tcgcsv tables; unknown types fall back to String)
        type_mapping = {
            'integer': Integer,
            'string': String,
            'boolean': Boolean,
        }

        # Build SQLAlchemy columns from the config definitions
        sqlalchemy_columns = []
        for column in columns:
            col_type = type_mapping.get(column['type'], String)
            sqlalchemy_columns.append(Column(
                column['name'],
                col_type,
                primary_key=column.get('primary_key', False),
                nullable=column.get('nullable', True),
                index=column.get('index', False),
                autoincrement=column.get('autoincrement', False),
            ))

        # Create the table using SQLAlchemy Core
        metadata = MetaData()
        Table(table_name, metadata, *sqlalchemy_columns)
        metadata.create_all(engine)
        print(f"Successfully created table: {table_name}")
    except Exception as e:
        print(f"Error creating table {table_name}: {e}")
        raise


def main():
    db = wmill.client.get_resource(DB_RESOURCE_PATH)
    config_yaml = wmill.get_variable(DB_CONFIG_PATH)
    config = yaml.safe_load(config_yaml)
    engine = create_db_engine(db)

    # Global table strategy from config (defaults to drop_and_recreate)
    table_strategy = config.get('table_strategy', 'drop_and_recreate')
    print(f"Using table strategy: {table_strategy}")

    for table in config['schema']['tables']:
        # Allow a per-table strategy override
        table_specific_strategy = table.get('strategy', table_strategy)
        create_table(engine, table, table_specific_strategy)

    return {"status": "success"}
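
A quick way to sanity-check create_table outside Windmill is to point it at a throwaway SQLite engine; SQLAlchemy Core renders the same column definitions through either dialect. A minimal sketch, assuming it runs in the same module as the script above (the table definition here is invented for the test):

# Hypothetical local smoke test (not part of this commit); SQLite stands in for Postgres.
from sqlalchemy import create_engine, inspect

test_engine = create_engine("sqlite:///:memory:")
test_table = {
    "name": "smoke_test",
    "columns": [
        {"name": "id", "type": "integer", "primary_key": True, "autoincrement": True},
        {"name": "uuid", "type": "string"},
    ],
}
create_table(test_engine, test_table, strategy="create_if_not_exists")
print(inspect(test_engine).get_columns("smoke_test"))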

View File

@@ -0,0 +1,14 @@
# py: 3.11
anyio==4.10.0
certifi==2025.8.3
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
psycopg2-binary==2.9.10
pyyaml==6.0.2
sniffio==1.3.1
sqlalchemy==2.0.43
typing-extensions==4.15.0
wmill==1.538.0

View File

@@ -0,0 +1,9 @@
summary: ''
description: ''
lock: '!inline f/CCR_ETL/ccr_etl_db_init.script.lock'
kind: script
schema:
  $schema: 'https://json-schema.org/draft/2020-12/schema'
  type: object
  properties: {}
  required: []

View File

@@ -0,0 +1,377 @@
"""
CCR ETL MTGJSON Processing Script
This script handles the extraction, transformation, and loading of MTGJSON data
into a PostgreSQL database. It supports downloading, unzipping, preprocessing,
and batch inserting of various data formats.
"""
import json
import yaml
from typing import Union
from zipfile import ZipFile

import psycopg2  # PostgreSQL driver used by SQLAlchemy's psycopg2 dialect
import requests
import wmill
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine

# Configuration paths
DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'
EXTRACT_CONFIG_PATH = 'f/CCR_ETL/ccr_extract_config'
DOWNLOAD_DIR = './shared/'

# Default processing parameters
DEFAULT_BATCH_SIZE = 1000
def validate_response_headers(response: requests.Response, outer_file_type: str) -> None:
    """Validate that the response content type matches the expected file type."""
    content_type = response.headers.get('Content-Type', '')
    # Compare on the media type only; servers often append parameters such as "; charset=utf-8"
    if not content_type.startswith(f'application/{outer_file_type}'):
        raise ValueError(f"Expected {outer_file_type} response, got {content_type}")


def download_request(url: str, filename: str, outer_file_type: str) -> bytes:
    """Download a file from the given URL and validate its content type."""
    print(f"🔽 Downloading {filename} from {url}")
    response = requests.get(url)
    response.raise_for_status()
    validate_response_headers(response, outer_file_type)
    print(f"✅ Download successful ({response.headers.get('Content-Length', 'unknown')} bytes)")
    return response.content
def generate_download_queue(url: str, filename: str, outer_file_type: str, iterables: dict) -> list:
    """
    Generate a queue of download items based on URL templates and iterable values.

    Example:
        url = "https://tcgcsv.com/tcgplayer/{game_id}/groups"
        iterables = {'game_id': [1, 3, 65, 71, 86]}
    """
    queue = []
    for key, values in iterables.items():
        for item in values:
            # Substitute the iterable's own key (e.g. {game_id}) into the templates
            queue_item = {
                'url': url.format(**{key: item}),
                'filename': filename.format(**{key: item}),
                'outer_file_type': outer_file_type,
            }
            queue.append(queue_item)
    return queue
def save_file(content: bytes, filename: str) -> None:
    """Save binary content to a file in the download directory."""
    filepath = DOWNLOAD_DIR + filename
    with open(filepath, 'wb') as f:
        f.write(content)
    print(f"💾 Saved {len(content)} bytes to {filename}")


def unzip_file(filename: str) -> str:
    """Extract a zip file and return the name of the extracted content."""
    new_filename = filename.replace('.zip', '')
    zip_path = DOWNLOAD_DIR + filename
    with ZipFile(zip_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        print(f"📦 Extracting {len(file_list)} files from {filename}")
        zip_ref.extractall(DOWNLOAD_DIR)
    return new_filename


def load_file(filename: str, file_type: str) -> Union[dict, list]:
    """Load and parse a file from the download directory."""
    filepath = DOWNLOAD_DIR + filename
    if file_type == 'json':
        with open(filepath, 'r') as f:
            data = json.load(f)
        print(f"📖 Loaded {file_type} file: {filename}")
        return data
    raise ValueError(f"Unsupported file type: {file_type}")
def build_record_from_config(source_data: dict, expected_columns: list, additional_data: dict = None) -> dict:
    """
    Build a record using the structure defined in the extract config.

    Args:
        source_data: The source data dictionary
        expected_columns: List of column definitions from config
        additional_data: Optional additional data to merge (e.g., parent UUID)

    Returns:
        Dictionary representing a single database record
    """
    if additional_data is None:
        additional_data = {}

    # Merge source data with additional data (like the uuid from the parent structure)
    combined_data = {**source_data, **additional_data}

    record = {}
    for column in expected_columns:
        col_name = column['name']
        # Skip auto-increment columns (like 'id'); the database populates them
        if column.get('autoincrement', False):
            continue
        # Get the value from the combined data, defaulting to an empty string
        record[col_name] = combined_data.get(col_name, '')
    return record
def create_db_engine(db: dict) -> Engine:
    """Create and connection-test a database engine."""
    db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = create_engine(db_url)
    # Test the connection up front so a bad resource fails fast
    with engine.connect():
        pass
    print(f"🔌 Connected to database: {db['host']}:{db['port']}/{db['dbname']}")
    return engine


def get_db_engine() -> Engine:
    """Get a database engine using the configured resource."""
    db = wmill.client.get_resource(DB_RESOURCE_PATH)
    return create_db_engine(db)
def generic_preprocess(
    data: Union[dict, list],
    expected_columns: list,
    config: dict
) -> list:
    """
    Generic data preprocessing function that handles various data structures.

    Args:
        data: Source data (dict or list)
        expected_columns: List of column definitions
        config: Preprocessing configuration

    Returns:
        List of processed records
    """
    # Step 1: Follow the configured data path into the document
    data_path = config.get("data_path", [])
    for key in data_path:
        if not isinstance(data, dict):
            raise ValueError(f"Expected dict while navigating path, got {type(data)} at key '{key}'")
        data = data.get(key)
        if data is None:
            raise ValueError(f"Missing key '{key}' in data path: {data_path}")

    # Step 2: Handle nested structures
    nested = config.get("nested", False)
    nested_key = config.get("nested_key", None)
    id_key = config.get("id_key", None)
    flatten = config.get("flatten", False)

    records = []
    if isinstance(data, dict):
        items = data.items()
    elif isinstance(data, list):
        items = enumerate(data)
    else:
        raise ValueError(f"Unsupported data structure: {type(data)}")

    for outer_key, outer_value in items:
        if nested:
            # e.g. TcgplayerSkus: a dict of {uuid: [sku, sku, ...]}
            if not isinstance(outer_value, list):
                continue
            for inner_value in outer_value:
                if id_key and not inner_value.get(id_key):
                    continue
                additional_data = {nested_key: outer_key} if nested_key else {}
                record = build_record_from_config(inner_value, expected_columns, additional_data)
                records.append(record)
        else:
            if not isinstance(outer_value, dict):
                continue
            if id_key and not outer_value.get(id_key):
                continue
            if flatten:
                # e.g. AllIdentifiers: hoist the nested 'identifiers' dict up beside the uuid
                nested_data = outer_value.get("identifiers", {})
                combined = {**nested_data, "uuid": outer_value.get("uuid")}
                record = build_record_from_config(combined, expected_columns)
            else:
                record = build_record_from_config(outer_value, expected_columns)
            records.append(record)

    print(f"🔄 Processed {len(records)} records")
    return records
def control_batch(data: list, batch_size: int = DEFAULT_BATCH_SIZE):
    """Split data into batches for processing."""
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]


def insert_data_into_table_batch(records: list, table: str, engine: Engine, batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """Insert records into a database table in batches."""
    if not records:
        print("⚠️ No records to insert, skipping database operation")
        return

    print(f"💾 Inserting {len(records)} records into {table} (batch size: {batch_size})")

    # Derive the column list from the first record
    columns = list(records[0].keys())
    column_names = ', '.join(f'"{col}"' for col in columns)
    placeholders = ', '.join(f':{col}' for col in columns)
    insert_sql = f"INSERT INTO {table} ({column_names}) VALUES ({placeholders})"

    with engine.connect() as conn:
        batch_count = 0
        total_inserted = 0
        for batch in control_batch(records, batch_size):
            batch_count += 1
            # Passing a list of dicts makes SQLAlchemy run an executemany
            conn.execute(text(insert_sql), batch)
            total_inserted += len(batch)
            if batch_count % 10 == 0:
                print(f"⏳ Inserted {total_inserted}/{len(records)} records...")
        conn.commit()
    print(f"✅ Inserted {total_inserted} records in {batch_count} batches")
def process_job(job: dict) -> dict:
    """
    Process a single ETL job.

    Args:
        job: Job configuration dictionary

    Returns:
        Dictionary with job processing results
    """
    # Extract job parameters
    url = job.get('url')
    filename = job.get('filename')
    outer_file_type = job.get('outer_file_type')
    inner_file_type = job.get('inner_file_type')
    table = job.get('table')
    expected_columns = job.get('expected_columns')
    batch_size = job.get('batch_size', DEFAULT_BATCH_SIZE)
    preprocess_function_name = job.get('preprocess_function', 'generic_preprocess')
    preprocess_config = job.get('preprocess_config')
    active = job.get('active')
    iterables = job.get('iterables')

    print(f"\n🚀 Processing job for table '{table}'")
    if not active:
        print("⚠️ Job is not active, skipping")
        return {"status": "skipped", "table": table}

    # Resolve the preprocessing function by name
    preprocess_function = globals().get(preprocess_function_name)
    if not callable(preprocess_function):
        raise ValueError(f"Preprocessing function '{preprocess_function_name}' not found or not callable.")

    # Get a database engine
    engine = get_db_engine()

    # Populate the download queue (iterables expand the URL/filename templates)
    if iterables:
        queue = generate_download_queue(url, filename, outer_file_type, iterables)
    else:
        queue = [{
            'url': url,
            'filename': filename,
            'outer_file_type': outer_file_type,
        }]

    # Process every item in the download queue
    records = []
    processed_filenames = []
    for queue_item in queue:
        content = download_request(queue_item['url'], queue_item['filename'], queue_item['outer_file_type'])
        save_file(content, queue_item['filename'])

        # Handle file extraction if needed; each item keeps its own concrete filename
        saved_filename = queue_item['filename']
        if queue_item['outer_file_type'] == 'zip':
            saved_filename = unzip_file(saved_filename)
        processed_filenames.append(saved_filename)

        # Load and preprocess this item's data
        data = load_file(saved_filename, inner_file_type)
        records.extend(preprocess_function(data, expected_columns, preprocess_config))

    # Insert all collected records into the database
    insert_data_into_table_batch(records, table, engine, batch_size)

    result = {
        "status": "success",
        "table": table,
        "records_processed": len(records),
        "filenames": processed_filenames,
    }
    print(f"✅ Job complete: {len(records)} records processed for {table}")
    return result
def main() -> dict:
    """
    Main ETL processing function.

    Returns:
        Dictionary with overall processing results
    """
    print("🎯 ETL Process Starting")
    print("=" * 50)

    # Load configuration
    config_yaml = wmill.get_variable(EXTRACT_CONFIG_PATH)
    config = yaml.safe_load(config_yaml)
    print(f"📋 Processing {len(config['jobs'])} jobs")

    results = []
    successful_jobs = 0
    failed_jobs = 0
    for i, job in enumerate(config['jobs'], 1):
        print(f"\n--- Job {i}/{len(config['jobs'])} ---")
        try:
            result = process_job(job)
            results.append(result)
            successful_jobs += 1
        except Exception as e:
            error_result = {
                "status": "error",
                "table": job.get('table', 'unknown'),
                "error": str(e),
                "filename": job.get('filename', 'unknown')
            }
            results.append(error_result)
            failed_jobs += 1
            print(f"❌ Job {i} failed: {e}")

    print("\n🏁 ETL Process Complete")
    print(f"✅ Successful: {successful_jobs} | ❌ Failed: {failed_jobs} | 📋 Total: {len(results)}")
    return {
        "status": "completed",
        "jobs_processed": len(results),
        "successful_jobs": successful_jobs,
        "failed_jobs": failed_jobs,
        "results": results
    }
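
To see what generic_preprocess produces for the nested TcgplayerSkus shape, here is a hypothetical miniature of the MTGJSON payload run through the same preprocess_config the skus job uses (all sample values invented for illustration):

# Hypothetical miniature of the TcgplayerSkus payload (sample values invented).
sample = {"data": {"abcd-1234": [
    {"condition": "NEAR MINT", "language": "ENGLISH", "printing": "NON FOIL",
     "productId": 42, "skuId": 999},
]}}
columns = [{"name": "uuid"}, {"name": "skuId"}, {"name": "condition"}]
cfg = {"data_path": ["data"], "nested": True, "nested_key": "uuid", "id_key": "skuId"}
print(generic_preprocess(sample, columns, cfg))
# [{'uuid': 'abcd-1234', 'skuId': 999, 'condition': 'NEAR MINT'}]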

View File

@@ -0,0 +1,17 @@
# py: 3.11
anyio==4.10.0
certifi==2025.8.3
charset-normalizer==3.4.3
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
psycopg2-binary==2.9.10
pyyaml==6.0.2
requests==2.32.5
sniffio==1.3.1
sqlalchemy==2.0.43
typing-extensions==4.15.0
urllib3==2.5.0
wmill==1.539.1

View File

@@ -0,0 +1,9 @@
summary: CCR ETL MTGJSON
description: ''
lock: '!inline f/CCR_ETL/ccr_etl_mtgjson.script.lock'
kind: script
schema:
  $schema: 'https://json-schema.org/draft/2020-12/schema'
  type: object
  properties: {}
  required: []

View File

@@ -0,0 +1,162 @@
description: ''
value: |-
  jobs:
    - name: mtgjson_skus
      active: true
      url: https://mtgjson.com/api/v5/TcgplayerSkus.json.zip
      filename: TcgplayerSkus.json.zip
      outer_file_type: zip
      inner_file_type: json
      preprocess_config:
        data_path: ["data"]
        nested: true
        nested_key: "uuid"
        id_key: "skuId"
      table: mtgjson_skus
      batch_size: 1000
      expected_columns:
        - name: uuid
          type: string
        - name: condition
          type: string
        - name: language
          type: string
        - name: printing
          type: string
        - name: finish
          type: string
        - name: productId
          type: string
        - name: skuId
          type: string
      cache:
        status: true
        ttl: 86400
    - name: mtgjson_identifiers
      active: true
      url: https://mtgjson.com/api/v5/AllIdentifiers.json.zip
      filename: AllIdentifiers.json.zip
      outer_file_type: zip
      inner_file_type: json
      preprocess_config:
        data_path: ["data"]
        nested: false
        flatten: true
        id_key: "uuid"
      table: mtgjson_identifiers
      batch_size: 1000
      expected_columns:
        - name: uuid
          type: string
        - name: name
          type: string
        - name: setCode
          type: string
        - name: abuId
          type: string
        - name: cardKingdomEtchedId
          type: string
        - name: cardKingdomFoilId
          type: string
        - name: cardKingdomId
          type: string
        - name: cardsphereId
          type: string
        - name: cardsphereFoilId
          type: string
        - name: cardtraderId
          type: string
        - name: csiId
          type: string
        - name: mcmId
          type: string
        - name: mcmMetaId
          type: string
        - name: miniaturemarketId
          type: string
        - name: mtgArenaId
          type: string
        - name: mtgjsonFoilVersionId
          type: string
        - name: mtgjsonNonFoilVersionId
          type: string
        - name: mtgjsonV4Id
          type: string
        - name: mtgoFoilId
          type: string
        - name: mtgoId
          type: string
        - name: multiverseId
          type: string
        - name: scgId
          type: string
        - name: scryfallId
          type: string
        - name: scryfallCardBackId
          type: string
        - name: scryfallOracleId
          type: string
        - name: scryfallIllustrationId
          type: string
        - name: tcgplayerProductId
          type: string
        - name: tcgplayerEtchedProductId
          type: string
        - name: tntId
          type: string
      cache:
        status: true
        ttl: 86400
    - name: tcgcsv_categories
      active: true
      url: https://tcgcsv.com/tcgplayer/categories
      filename: tcgplayer_categories.json
      outer_file_type: json
      inner_file_type: json
      table: tcgcsv_categories
      preprocess_config:
        data_path: ["results"]
        nested: false
      expected_columns:
        - name: categoryId
          type: integer
        - name: name
          type: string
        - name: modifiedOn
          type: string
        - name: displayName
          type: string
        - name: seoCategoryName
          type: string
        - name: categoryDescription
          type: string
        - name: categoryPageTitle
          type: string
        - name: sealedLabel
          type: string
        - name: nonSealedLabel
          type: string
        - name: conditionGuideUrl
          type: string
        - name: isScannable
          type: boolean
        - name: popularity
          type: integer
        - name: isDirect
          type: boolean
    - name: tcgcsv_groups
      active: true
      url: https://tcgcsv.com/tcgplayer/{game_id}/groups
      filename: tcgplayer_{game_id}_groups.json
      outer_file_type: json
      inner_file_type: json
      table: tcgcsv_groups
      preprocess_config:
        data_path: ["results"]
        nested: false
      expected_columns:
        - name: groupId
          type: integer
        - name: name
          type: string
        - name: abbreviation
          type: string
        - name: isSupplemental
          type: boolean
        - name: publishedOn
          type: string
        - name: modifiedOn
          type: string
        - name: categoryId
          type: integer
      iterables:
        game_id: [1, 3, 65, 71, 86]
is_secret: false
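
The iterables block on tcgcsv_groups is what fans the templated url and filename out into one download per game. A sketch of the expansion, assuming the generate_download_queue from ccr_etl_mtgjson above:

# Sketch of the iterables expansion (uses generate_download_queue from the script above).
queue = generate_download_queue(
    url="https://tcgcsv.com/tcgplayer/{game_id}/groups",
    filename="tcgplayer_{game_id}_groups.json",
    outer_file_type="json",
    iterables={"game_id": [1, 3, 65, 71, 86]},
)
for item in queue:
    print(item["url"], "->", item["filename"])
# https://tcgcsv.com/tcgplayer/1/groups -> tcgplayer_1_groups.json
# ...and so on for game_ids 3, 65, 71, 86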

View File

@@ -0,0 +1,6 @@
summary: null
display_name: CCR_ETL
extra_perms:
  u/joshuakrzemien: true
owners:
  - u/joshuakrzemien