commit

f/CCR_ETL/ccr_db_config.variable.yaml (new file, 189 lines)
@@ -0,0 +1,189 @@
description: ''
value: |-
  # Global table strategy (can be overridden per table)
  table_strategy: drop_and_recreate
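  # Supported strategies (implemented in ccr_etl_db_init.py):
  #   drop_and_recreate    - drop the table if it exists, then create it fresh
  #   create_if_not_exists - leave an existing table untouched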
  schema:
    name: ccr_etl_raw
    version: 1.0.0
    description: CCR ETL Raw Data Schema
    tables:
      - name: mtgjson_skus
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: uuid
            type: string
            description: The UUID of the MTGJSON SKU
          - name: condition
            type: string
            description: The condition of the MTGJSON SKU
          - name: language
            type: string
            description: The language of the MTGJSON SKU
          - name: printing
            type: string
            description: The printing of the MTGJSON SKU
          - name: finish
            type: string
            description: The finish of the MTGJSON SKU
          - name: productId
            type: string
            description: The TCGplayer product ID of the MTGJSON SKU
          - name: skuId
            type: string
            description: The TCGplayer SKU ID of the MTGJSON SKU
      - name: mtgjson_identifiers
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: uuid
            type: string
            description: The UUID of the MTGJSON Identifier
          - name: name
            type: string
            description: The name of the MTGJSON Identifier
          - name: setCode
            type: string
            description: The set code of the MTGJSON Identifier
          - name: abuId
            type: string
            description: The Abu Games ID
          - name: cardKingdomEtchedId
            type: string
            description: The Card Kingdom Etched ID
          - name: cardKingdomFoilId
            type: string
            description: The Card Kingdom Foil ID
          - name: cardKingdomId
            type: string
            description: The Card Kingdom ID
          - name: cardsphereId
            type: string
            description: The Cardsphere ID
          - name: cardsphereFoilId
            type: string
            description: The Cardsphere Foil ID
          - name: cardtraderId
            type: string
            description: The Cardtrader ID
          - name: csiId
            type: string
            description: The Cool Stuff Inc ID
          - name: mcmId
            type: string
            description: The Cardmarket ID
          - name: mcmMetaId
            type: string
            description: The Cardmarket meta ID
          - name: miniaturemarketId
            type: string
            description: The Miniature Market ID
          - name: mtgArenaId
            type: string
            description: The MTG Arena ID
          - name: mtgjsonFoilVersionId
            type: string
            description: The UUID generated by MTGJSON for the foil version of a card
          - name: mtgjsonNonFoilVersionId
            type: string
            description: The UUID generated by MTGJSON for the non-foil version of a card
          - name: mtgjsonV4Id
            type: string
            description: The v4 UUID generated by MTGJSON for a card
          - name: mtgoFoilId
            type: string
            description: The MTGO foil ID
          - name: mtgoId
            type: string
            description: The MTGO ID
          - name: multiverseId
            type: string
            description: The multiverse ID used by WotC for Gatherer
          - name: scgId
            type: string
            description: The Star City Games ID
          - name: scryfallId
            type: string
            description: The Scryfall ID
          - name: scryfallCardBackId
            type: string
            description: The Scryfall card back ID
          - name: scryfallOracleId
            type: string
            description: The Scryfall oracle ID
          - name: scryfallIllustrationId
            type: string
            description: The Scryfall illustration ID
          - name: tcgplayerProductId
            type: string
            description: The TCGplayer product ID
          - name: tcgplayerEtchedProductId
            type: string
            description: The TCGplayer etched product ID
          - name: tntId
            type: string
            description: The Troll and Toad ID
      - name: tcgcsv_categories
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            description: internal database id
            primary_key: true
            autoincrement: true
          - name: categoryId
            type: integer
          - name: name
            type: string
          - name: modifiedOn
            type: string
          - name: displayName
            type: string
          - name: seoCategoryName
            type: string
          - name: categoryDescription
            type: string
          - name: categoryPageTitle
            type: string
          - name: sealedLabel
            type: string
          - name: nonSealedLabel
            type: string
          - name: conditionGuideUrl
            type: string
          - name: isScannable
            type: boolean
          - name: popularity
            type: integer
          - name: isDirect
            type: boolean
      - name: tcgcsv_groups
        strategy: drop_and_recreate
        columns:
          - name: id
            type: integer
            primary_key: true
            autoincrement: true
          - name: groupId
            type: integer
          - name: name
            type: string
          - name: abbreviation
            type: string
          - name: isSupplemental
            type: boolean
          - name: publishedOn
            type: string
          - name: modifiedOn
            type: string
          - name: categoryId
            type: integer
is_secret: false

f/CCR_ETL/ccr_etl_db_init.py (new file, 89 lines)
@@ -0,0 +1,89 @@
import wmill
import yaml
from sqlalchemy import create_engine, text, MetaData, Table, Column, Integer, String, Boolean, inspect
from sqlalchemy.engine import Engine
import psycopg2  # not used directly; kept so Windmill's dependency resolution picks up psycopg2-binary

# You can import any PyPI package.
# See here for more info: https://www.windmill.dev/docs/advanced/dependencies_in_python

# You can use typed resources by doing a type alias to dict
# postgresql = dict

DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'

def create_db_engine(db: dict) -> Engine:
    """Create a database engine and verify that a connection can be opened."""
    db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = create_engine(db_url)
    # Open and close a connection once so a bad resource fails fast here
    with engine.connect():
        pass
    return engine


def table_exists(engine: Engine, table_name: str) -> bool:
    """Check if a table exists in the database."""
    inspector = inspect(engine)
    return table_name in inspector.get_table_names()

def create_table(engine: Engine, table: dict, strategy: str = "create_if_not_exists"):
    table_name = table.get('name', 'unknown')
    try:
        columns = table['columns']

        # Handle the different table strategies
        if strategy == "drop_and_recreate":
            if table_exists(engine, table_name):
                print(f"Dropping existing table: {table_name}")
                with engine.connect() as conn:
                    conn.execute(text(f"DROP TABLE IF EXISTS {table_name} CASCADE"))
                    conn.commit()

        elif strategy == "create_if_not_exists":
            if table_exists(engine, table_name):
                print(f"Table {table_name} already exists, skipping creation")
                return

        else:
            raise ValueError(f"Unknown table strategy: {strategy}")

        # Map config types to SQLAlchemy types; anything unmapped falls back to String
        type_mapping = {
            'integer': Integer,
            'string': String,
            'boolean': Boolean,  # the tcgcsv tables declare boolean columns
        }

        # Build SQLAlchemy columns from the config
        sqlalchemy_columns = []
        for column in columns:
            col_type = type_mapping.get(column['type'], String)
            sqlalchemy_columns.append(
                Column(
                    column['name'],
                    col_type,
                    primary_key=column.get('primary_key', False),
                    nullable=column.get('nullable', True),
                    index=column.get('index', False),
                    autoincrement=column.get('autoincrement', False),
                )
            )

        # Create the table using SQLAlchemy Core
        metadata = MetaData()
        Table(table_name, metadata, *sqlalchemy_columns)
        metadata.create_all(engine)

        print(f"Successfully created table: {table_name}")

    except Exception as e:
        print(f"Error creating table {table_name}: {str(e)}")
        raise
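

# For reference, the DDL this emits for mtgjson_skus on PostgreSQL looks roughly like:
#
#   CREATE TABLE mtgjson_skus (
#       id SERIAL NOT NULL,
#       uuid VARCHAR,
#       condition VARCHAR,
#       ...
#       PRIMARY KEY (id)
#   )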


def main():
    db = wmill.client.get_resource(DB_RESOURCE_PATH)
    config_yaml = wmill.get_variable(DB_CONFIG_PATH)
    config = yaml.safe_load(config_yaml)
    engine = create_db_engine(db)

    # Get the global table strategy from config (default to drop_and_recreate)
    table_strategy = config.get('table_strategy', 'drop_and_recreate')
    print(f"Using table strategy: {table_strategy}")

    for table in config['schema']['tables']:
        # Allow a per-table strategy override
        table_specific_strategy = table.get('strategy', table_strategy)
        create_table(engine, table, table_specific_strategy)

    return {"status": "success"}

f/CCR_ETL/ccr_etl_db_init.script.lock (new file, 14 lines)
@@ -0,0 +1,14 @@
# py: 3.11
anyio==4.10.0
certifi==2025.8.3
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
psycopg2-binary==2.9.10
pyyaml==6.0.2
sniffio==1.3.1
sqlalchemy==2.0.43
typing-extensions==4.15.0
wmill==1.538.0

f/CCR_ETL/ccr_etl_db_init.script.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
summary: ''
description: ''
lock: '!inline f/CCR_ETL/ccr_etl_db_init.script.lock'
kind: script
schema:
  $schema: 'https://json-schema.org/draft/2020-12/schema'
  type: object
  properties: {}
  required: []

f/CCR_ETL/ccr_etl_mtgjson.py (new file, 377 lines)
@@ -0,0 +1,377 @@
"""
CCR ETL MTGJSON Processing Script

This script handles the extraction, transformation, and loading of MTGJSON data
into a PostgreSQL database. It supports downloading, unzipping, preprocessing,
and batch inserting of various data formats.
"""

import json
import yaml
from typing import Union
from zipfile import ZipFile

import psycopg2  # not used directly; kept so Windmill's dependency resolution picks up psycopg2-binary
import requests
import wmill
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine

# Configuration paths
DB_RESOURCE_PATH = 'u/joshuakrzemien/slick_postgresql'
DB_CONFIG_PATH = 'f/CCR_ETL/ccr_db_config'
EXTRACT_CONFIG_PATH = 'f/CCR_ETL/ccr_extract_config'
DOWNLOAD_CONFIG_PATH = './shared/'

# Default processing parameters
DEFAULT_BATCH_SIZE = 1000

def validate_response_headers(response: requests.Response, outer_file_type: str) -> None:
    """Validate that the response content type matches the expected file type."""
    # Compare only the media type; servers may append parameters like charset
    content_type = response.headers.get('Content-Type', '').split(';')[0].strip()
    if content_type != f'application/{outer_file_type}':
        raise ValueError(f"Expected {outer_file_type} response, got {content_type}")


def download_request(url: str, filename: str, outer_file_type: str) -> bytes:
    """Download a file from the given URL and validate its content type."""
    print(f"🔽 Downloading {filename} from {url}")

    response = requests.get(url, timeout=120)  # avoid hanging forever on a stalled download
    response.raise_for_status()
    validate_response_headers(response, outer_file_type)

    print(f"✅ Download successful ({response.headers.get('Content-Length', 'Unknown')} bytes)")
    return response.content


def generate_download_queue(url: str, filename: str, outer_file_type: str, iterables: dict) -> list:
    """
    Generate a queue of download items based on URL templates and iterable values.

    Example:
        url = "https://tcgcsv.com/tcgplayer/{game_id}/groups"
        iterables = {'game_id': [1,3,65,71,86]}
    """
    queue = []
    for key, value in iterables.items():
        for item in value:
            queue_item = {
                # Substitute the named placeholder (e.g. {game_id}) with each value
                'url': url.format(**{key: item}),
                'filename': filename.format(**{key: item}),
                'outer_file_type': outer_file_type,
            }
            queue.append(queue_item)
    return queue
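
# With the docstring's example inputs (and the groups job's filename template),
# the queue expands to items like:
#   {'url': 'https://tcgcsv.com/tcgplayer/1/groups',
#    'filename': 'tcgplayer_1_groups.json', 'outer_file_type': 'json'}
#   {'url': 'https://tcgcsv.com/tcgplayer/3/groups', ...}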


def save_file(content: bytes, filename: str) -> None:
    """Save binary content to a file in the download directory."""
    filepath = DOWNLOAD_CONFIG_PATH + filename
    with open(filepath, 'wb') as f:
        f.write(content)
    print(f"💾 Saved {len(content)} bytes to {filename}")


def unzip_file(filename: str) -> str:
    """Extract a zip file and return the name of the extracted content."""
    new_filename = filename.replace('.zip', '')
    zip_path = DOWNLOAD_CONFIG_PATH + filename

    with ZipFile(zip_path, 'r') as zip_ref:
        file_list = zip_ref.namelist()
        print(f"📦 Extracting {len(file_list)} files from {filename}")
        zip_ref.extractall(DOWNLOAD_CONFIG_PATH)

    return new_filename


def load_file(filename: str, file_type: str) -> Union[dict, list]:
    """Load and parse a file from the download directory."""
    filepath = DOWNLOAD_CONFIG_PATH + filename

    if file_type == 'json':
        with open(filepath, 'r') as f:
            data = json.load(f)
        print(f"📖 Loaded {file_type} file: {filename}")
        return data
    else:
        raise ValueError(f"Unsupported file type: {file_type}")


def build_record_from_config(source_data: dict, expected_columns: list, additional_data: dict = None) -> dict:
    """
    Build a record using the structure defined in the extract config.

    Args:
        source_data: The source data dictionary
        expected_columns: List of column definitions from config
        additional_data: Optional additional data to merge (e.g., parent UUID)

    Returns:
        Dictionary representing a single database record
    """
    if additional_data is None:
        additional_data = {}

    # Merge source data with additional data (like uuid from the parent structure)
    combined_data = {**source_data, **additional_data}

    record = {}
    for column in expected_columns:
        col_name = column['name']
        # Skip auto-increment columns (like 'id'); the key matches the
        # 'autoincrement' spelling used in the db config
        if column.get('autoincrement', False):
            continue
        # Get value from combined data, use empty string as default
        record[col_name] = combined_data.get(col_name, '')

    return record
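
# Sketch: with expected_columns [{'name': 'uuid'}, {'name': 'condition'}],
# source_data {'condition': 'NEAR MINT'} and additional_data {'uuid': 'abc-123'}
# produce {'uuid': 'abc-123', 'condition': 'NEAR MINT'}; missing keys become ''.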


def create_db_engine(db: dict) -> Engine:
    """Create and test a database engine connection."""
    db_url = f"postgresql+psycopg2://postgres:{db['password']}@{db['host']}:{db['port']}/{db['dbname']}"
    engine = create_engine(db_url)

    # Test connection
    conn = engine.connect()
    conn.close()
    print(f"🔌 Connected to database: {db['host']}:{db['port']}/{db['dbname']}")

    return engine


def get_db_engine() -> Engine:
    """Get a database engine using the configured resource."""
    db = wmill.client.get_resource(DB_RESOURCE_PATH)
    return create_db_engine(db)


def generic_preprocess(
    data: Union[dict, list],
    expected_columns: list,
    config: dict
) -> list:
    """
    Generic data preprocessing function that handles various data structures.

    Args:
        data: Source data (dict or list)
        expected_columns: List of column definitions
        config: Preprocessing configuration

    Returns:
        List of processed records
    """
    # Step 1: Follow the configured data path down into the payload
    data_path = config.get("data_path", [])
    for key in data_path:
        if not isinstance(data, dict):
            raise ValueError(f"Expected dict while navigating path, got {type(data)} at key '{key}'")
        data = data.get(key)
        if data is None:
            raise ValueError(f"Missing key '{key}' in data path: {data_path}")

    # Step 2: Handle nested structure
    nested = config.get("nested", False)
    nested_key = config.get("nested_key", None)
    id_key = config.get("id_key", None)
    flatten = config.get("flatten", False)

    records = []

    if isinstance(data, dict):
        items = data.items()
    elif isinstance(data, list):
        items = enumerate(data)
    else:
        raise ValueError(f"Unsupported data structure: {type(data)}")

    for outer_key, outer_value in items:
        if nested:
            # Each outer key maps to a list of child records; the outer key
            # itself (e.g. a card uuid) is attached via nested_key
            if not isinstance(outer_value, list):
                continue
            for inner_value in outer_value:
                if id_key and not inner_value.get(id_key):
                    continue
                additional_data = {nested_key: outer_key} if nested_key else {}
                record = build_record_from_config(inner_value, expected_columns, additional_data)
                records.append(record)
        else:
            if not isinstance(outer_value, dict):
                continue
            if id_key and not outer_value.get(id_key):
                continue
            if flatten:
                # Pull the 'identifiers' sub-dict up to the top level alongside the uuid
                nested_data = outer_value.get("identifiers", {})
                combined = {**nested_data, "uuid": outer_value.get("uuid")}
                record = build_record_from_config(combined, expected_columns)
            else:
                record = build_record_from_config(outer_value, expected_columns)
            records.append(record)

    print(f"🔄 Processed {len(records)} records")
    return records
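
# The two payload shapes this handles in this pipeline:
#   nested=true  (TcgplayerSkus):  {"data": {"<card uuid>": [{sku}, {sku}, ...]}}
#   flatten=true (AllIdentifiers): {"data": {"<card uuid>": {"uuid": ..., "identifiers": {...}}}}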


def control_batch(data: list, batch_size: int = DEFAULT_BATCH_SIZE):
    """Split data into batches for processing."""
    for i in range(0, len(data), batch_size):
        yield data[i:i+batch_size]
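
# e.g. 2,500 records with batch_size=1000 yield batches of 1000, 1000, and 500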


def insert_data_into_table_batch(records: list, table: str, engine: Engine, batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """Insert records into a database table in batches."""
    if not records:
        print("⚠️ No records to insert, skipping database operation")
        return

    print(f"💾 Inserting {len(records)} records into {table} (batch size: {batch_size})")

    # Get column names from the first record
    columns = list(records[0].keys())
    column_names = ', '.join(f'"{col}"' for col in columns)
    placeholders = ', '.join(f':{col}' for col in columns)

    insert_sql = f"INSERT INTO {table} ({column_names}) VALUES ({placeholders})"

    with engine.connect() as conn:
        batch_count = 0
        total_inserted = 0

        for batch in control_batch(records, batch_size):
            batch_count += 1
            batch_size_actual = len(batch)

            conn.execute(text(insert_sql), batch)
            total_inserted += batch_size_actual

            if batch_count % 10 == 0:
                print(f"⏳ Inserted {total_inserted}/{len(records)} records...")

        conn.commit()
        print(f"✅ Inserted {total_inserted} records in {batch_count} batches")


def process_job(job: dict) -> dict:
    """
    Process a single ETL job.

    Args:
        job: Job configuration dictionary

    Returns:
        Dictionary with job processing results
    """
    # Extract job parameters
    url = job.get('url')
    filename = job.get('filename')
    outer_file_type = job.get('outer_file_type')
    # Fall back sensibly when inner_file_type is omitted: zips contain json here,
    # and uncompressed downloads parse as their outer type
    inner_file_type = job.get('inner_file_type') or ('json' if outer_file_type == 'zip' else outer_file_type)
    table = job.get('table')
    expected_columns = job.get('expected_columns')
    batch_size = job.get('batch_size', DEFAULT_BATCH_SIZE)
    preprocess_function_name = job.get('preprocess_function', 'generic_preprocess')
    preprocess_config = job.get('preprocess_config')
    active = job.get('active')
    iterables = job.get('iterables')

    print(f"\n🚀 Processing job for table '{table}'")

    if not active:
        print("⚠️ Job is not active, skipping")
        return {"status": "skipped"}

    # Resolve the preprocessing function by name
    preprocess_function = globals().get(preprocess_function_name) if isinstance(preprocess_function_name, str) else preprocess_function_name
    if not callable(preprocess_function):
        raise ValueError(f"Preprocessing function '{preprocess_function_name}' not found or not callable.")

    # Get database engine
    engine = get_db_engine()

    # Populate the download queue
    if iterables:
        queue = generate_download_queue(url, filename, outer_file_type, iterables)
    else:
        queue = [{
            'url': url,
            'filename': filename,
            'outer_file_type': outer_file_type,
        }]

    # Download, extract, load, preprocess, and insert each queue item.
    # Running the full pipeline per item matters for iterable jobs, where each
    # item downloads to its own formatted filename.
    total_records = 0
    saved_filename = filename
    for queue_item in queue:
        item_filename = queue_item.get('filename')
        content = download_request(queue_item.get('url'), item_filename, queue_item.get('outer_file_type'))
        save_file(content, item_filename)

        # Handle file extraction if needed
        saved_filename = item_filename
        if outer_file_type == 'zip':
            saved_filename = unzip_file(item_filename)

        # Load and preprocess data
        data = load_file(saved_filename, inner_file_type)
        records = preprocess_function(data, expected_columns, preprocess_config)

        # Insert data into database
        insert_data_into_table_batch(records, table, engine, batch_size)
        total_records += len(records)

    result = {
        "status": "success",
        "table": table,
        "records_processed": total_records,
        "filename": saved_filename
    }

    print(f"✅ Job complete: {total_records} records processed for {table}")
    return result


def main() -> dict:
    """
    Main ETL processing function.

    Returns:
        Dictionary with overall processing results
    """
    print("🎯 ETL Process Starting")
    print("=" * 50)

    # Load configuration
    config_yaml = wmill.get_variable(EXTRACT_CONFIG_PATH)
    config = yaml.safe_load(config_yaml)
    print(f"📋 Processing {len(config['jobs'])} jobs")

    results = []
    successful_jobs = 0
    failed_jobs = 0

    for i, job in enumerate(config['jobs'], 1):
        print(f"\n--- Job {i}/{len(config['jobs'])} ---")

        try:
            result = process_job(job)
            results.append(result)
            successful_jobs += 1
        except Exception as e:
            error_result = {
                "status": "error",
                "table": job.get('table', 'unknown'),
                "error": str(e),
                "filename": job.get('filename', 'unknown')
            }
            results.append(error_result)
            failed_jobs += 1
            print(f"❌ Job {i} failed: {str(e)}")

    print("\n🏁 ETL Process Complete")
    print(f"✅ Successful: {successful_jobs} | ❌ Failed: {failed_jobs} | 📋 Total: {len(results)}")

    return {
        "status": "completed",
        "jobs_processed": len(results),
        "successful_jobs": successful_jobs,
        "failed_jobs": failed_jobs,
        "results": results
    }

f/CCR_ETL/ccr_etl_mtgjson.script.lock (new file, 17 lines)
@@ -0,0 +1,17 @@
# py: 3.11
anyio==4.10.0
certifi==2025.8.3
charset-normalizer==3.4.3
greenlet==3.2.4
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
psycopg2-binary==2.9.10
pyyaml==6.0.2
requests==2.32.5
sniffio==1.3.1
sqlalchemy==2.0.43
typing-extensions==4.15.0
urllib3==2.5.0
wmill==1.539.1

f/CCR_ETL/ccr_etl_mtgjson.script.yaml (new file, 9 lines)
@@ -0,0 +1,9 @@
summary: CCR ETL MTGJSON
description: ''
lock: '!inline f/CCR_ETL/ccr_etl_mtgjson.script.lock'
kind: script
schema:
  $schema: 'https://json-schema.org/draft/2020-12/schema'
  type: object
  properties: {}
  required: []

f/CCR_ETL/ccr_extract_config.variable.yaml (new file, 162 lines)
@@ -0,0 +1,162 @@
description: ''
value: |-
  jobs:
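    # Fields read by process_job in ccr_etl_mtgjson.py: url, filename,
    # outer_file_type, inner_file_type, table, expected_columns, batch_size,
    # preprocess_function, preprocess_config, iterables, active
    # (cache appears below but is not yet consumed by the script)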
    - name: mtgjson_skus
      active: true
      url: https://mtgjson.com/api/v5/TcgplayerSkus.json.zip
      filename: TcgplayerSkus.json.zip
      outer_file_type: zip
      inner_file_type: json
      preprocess_config:
        data_path: ["data"]
        nested: true
        nested_key: "uuid"
        id_key: "skuId"
      table: mtgjson_skus
      batch_size: 1000
      expected_columns:
        - name: uuid
          type: string
        - name: condition
          type: string
        - name: language
          type: string
        - name: printing
          type: string
        - name: finish
          type: string
        - name: productId
          type: string
        - name: skuId
          type: string
      cache:
        status: true
        ttl: 86400
    - name: mtgjson_identifiers
      active: true
      url: https://mtgjson.com/api/v5/AllIdentifiers.json.zip
      filename: AllIdentifiers.json.zip
      outer_file_type: zip
      inner_file_type: json
      preprocess_config:
        data_path: ["data"]
        nested: false
        flatten: true
        id_key: "uuid"
      table: mtgjson_identifiers
      batch_size: 1000
      expected_columns:
        - name: uuid
          type: string
        - name: name
          type: string
        - name: setCode
          type: string
        - name: abuId
          type: string
        - name: cardKingdomEtchedId
          type: string
        - name: cardKingdomFoilId
          type: string
        - name: cardKingdomId
          type: string
        - name: cardsphereId
          type: string
        - name: cardsphereFoilId
          type: string
        - name: cardtraderId
          type: string
        - name: csiId
          type: string
        - name: mcmId
          type: string
        - name: mcmMetaId
          type: string
        - name: miniaturemarketId
          type: string
        - name: mtgArenaId
          type: string
        - name: mtgjsonFoilVersionId
          type: string
        - name: mtgjsonNonFoilVersionId
          type: string
        - name: mtgjsonV4Id
          type: string
        - name: mtgoFoilId
          type: string
        - name: mtgoId
          type: string
        - name: multiverseId
          type: string
        - name: scgId
          type: string
        - name: scryfallId
          type: string
        - name: scryfallCardBackId
          type: string
        - name: scryfallOracleId
          type: string
        - name: scryfallIllustrationId
          type: string
        - name: tcgplayerProductId
          type: string
        - name: tcgplayerEtchedProductId
          type: string
        - name: tntId
          type: string
      cache:
        status: true
        ttl: 86400
    - name: tcgcsv_categories
      active: true
      url: https://tcgcsv.com/tcgplayer/categories
      outer_file_type: json
      preprocess_config:
        data_path: ["results"]
        nested: false
      filename: tcgplayer_categories.json
      # table was missing; process_job reads job['table'] for inserts
      table: tcgcsv_categories
      expected_columns:
        - name: categoryId
          type: integer
        - name: name
          type: string
        - name: modifiedOn
          type: string
        - name: displayName
          type: string
        - name: seoCategoryName
          type: string
        - name: categoryDescription
          type: string
        - name: categoryPageTitle
          type: string
        - name: sealedLabel
          type: string
        - name: nonSealedLabel
          type: string
        - name: conditionGuideUrl
          type: string
        - name: isScannable
          type: boolean
        - name: popularity
          type: integer
        - name: isDirect
          type: boolean
    - name: tcgcsv_groups
      active: true
      url: https://tcgcsv.com/tcgplayer/{game_id}/groups
      outer_file_type: json
      preprocess_config:
        data_path: ["results"]
        nested: false
      filename: tcgplayer_{game_id}_groups.json
      # table was missing; process_job reads job['table'] for inserts
      table: tcgcsv_groups
      expected_columns:
        - name: groupId
          type: integer
        - name: name
          type: string
        - name: modifiedOn
          type: string
      iterables:
        game_id: [1,3,65,71,86]
is_secret: false

f/CCR_ETL/folder.meta.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
summary: null
display_name: CCR_ETL
extra_perms:
  u/joshuakrzemien: true
owners:
  - u/joshuakrzemien