# Source snapshot metadata (from paste viewer): 2025-04-12 23:51:17 -04:00 — 313 lines, 16 KiB, Python.
import os
import json
import zipfile
import aiohttp
import asyncio
import time
import sys
from typing import Dict, Any, Optional, Generator
from sqlalchemy.orm import Session
from datetime import datetime
from app.models.mtgjson_card import MTGJSONCard
from app.models.mtgjson_sku import MTGJSONSKU
class MTGJSONService:
def __init__(self, cache_dir: str = "app/data/cache/mtgjson", batch_size: int = 1000):
self.cache_dir = cache_dir
self.identifiers_dir = os.path.join(cache_dir, "identifiers")
self.skus_dir = os.path.join(cache_dir, "skus")
self.batch_size = batch_size
# Create necessary directories
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(self.identifiers_dir, exist_ok=True)
os.makedirs(self.skus_dir, exist_ok=True)
def _format_progress(self, current: int, total: int, start_time: float) -> str:
"""Format a progress message with percentage and timing information"""
elapsed = time.time() - start_time
if total > 0:
percent = (current / total) * 100
items_per_second = current / elapsed if elapsed > 0 else 0
eta = (total - current) / items_per_second if items_per_second > 0 else 0
return f"[{current}/{total} ({percent:.1f}%)] {items_per_second:.1f} items/sec, ETA: {eta:.1f}s"
return f"[{current} items] {current/elapsed:.1f} items/sec"
def _print_progress(self, message: str, end: str = "\n") -> None:
"""Print progress message with flush"""
print(message, end=end, flush=True)
async def _download_file(self, url: str, output_path: str) -> None:
"""Download a file from the given URL to the specified path using streaming"""
print(f"Downloading {url}...")
start_time = time.time()
total_size = 0
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
if response.status == 200:
total_size = int(response.headers.get('content-length', 0))
with open(output_path, 'wb') as f:
downloaded = 0
async for chunk in response.content.iter_chunked(8192):
f.write(chunk)
downloaded += len(chunk)
if total_size > 0:
percent = (downloaded / total_size) * 100
elapsed = time.time() - start_time
speed = downloaded / elapsed / 1024 / 1024 # MB/s
print(f"\rDownloading: {percent:.1f}% ({downloaded/1024/1024:.1f}MB/{total_size/1024/1024:.1f}MB) at {speed:.1f}MB/s", end="")
print("\nDownload complete!")
else:
raise Exception(f"Failed to download file from {url}. Status: {response.status}")
async def _unzip_file(self, zip_path: str, extract_dir: str) -> str:
"""Unzip a file to the specified directory and return the path to the extracted JSON file"""
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
json_filename = zip_ref.namelist()[0]
zip_ref.extractall(extract_dir)
return os.path.join(extract_dir, json_filename)
def _stream_json_file(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
"""Stream a JSON file and yield items one at a time"""
print(f"Starting to stream JSON file: {file_path}")
with open(file_path, 'r') as f:
# Load the entire file since MTGJSON uses a specific format
data = json.load(f)
# First yield the meta data
if "meta" in data:
yield {"type": "meta", "data": data["meta"]}
# Then yield each item in the data section
if "data" in data:
for key, value in data["data"].items():
yield {"type": "item", "data": {key: value}}
async def _process_batch(self, db: Session, items: list, model_class, commit: bool = True) -> int:
"""Process a batch of items and add them to the database"""
processed = 0
for item in items:
if model_class == MTGJSONCard:
# Check if card already exists
existing_card = db.query(MTGJSONCard).filter(MTGJSONCard.card_id == item["card_id"]).first()
if existing_card:
continue
new_item = MTGJSONCard(
card_id=item["card_id"],
name=item["name"],
set_code=item["set_code"],
uuid=item["uuid"],
abu_id=item.get("abu_id"),
card_kingdom_etched_id=item.get("card_kingdom_etched_id"),
card_kingdom_foil_id=item.get("card_kingdom_foil_id"),
card_kingdom_id=item.get("card_kingdom_id"),
cardsphere_id=item.get("cardsphere_id"),
cardsphere_foil_id=item.get("cardsphere_foil_id"),
cardtrader_id=item.get("cardtrader_id"),
csi_id=item.get("csi_id"),
mcm_id=item.get("mcm_id"),
mcm_meta_id=item.get("mcm_meta_id"),
miniaturemarket_id=item.get("miniaturemarket_id"),
mtg_arena_id=item.get("mtg_arena_id"),
mtgjson_foil_version_id=item.get("mtgjson_foil_version_id"),
mtgjson_non_foil_version_id=item.get("mtgjson_non_foil_version_id"),
mtgjson_v4_id=item.get("mtgjson_v4_id"),
mtgo_foil_id=item.get("mtgo_foil_id"),
mtgo_id=item.get("mtgo_id"),
multiverse_id=item.get("multiverse_id"),
scg_id=item.get("scg_id"),
scryfall_id=item.get("scryfall_id"),
scryfall_card_back_id=item.get("scryfall_card_back_id"),
scryfall_oracle_id=item.get("scryfall_oracle_id"),
scryfall_illustration_id=item.get("scryfall_illustration_id"),
tcgplayer_product_id=item.get("tcgplayer_product_id"),
tcgplayer_etched_product_id=item.get("tcgplayer_etched_product_id"),
tnt_id=item.get("tnt_id")
)
else: # MTGJSONSKU
# Check if SKU already exists
existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == item["sku_id"]).first()
if existing_sku:
continue
new_item = MTGJSONSKU(
sku_id=str(item["sku_id"]),
product_id=str(item["product_id"]),
condition=item["condition"],
finish=item["finish"],
language=item["language"],
printing=item["printing"],
card_id=item["card_id"]
)
db.add(new_item)
processed += 1
if commit:
try:
db.commit()
except Exception as e:
db.rollback()
raise e
return processed
async def download_and_process_identifiers(self, db: Session) -> Dict[str, int]:
"""Download, unzip and process AllIdentifiers.json.zip using streaming"""
self._print_progress("Starting MTGJSON identifiers processing...")
start_time = time.time()
zip_path = os.path.join(self.identifiers_dir, "AllIdentifiers.json.zip")
await self._download_file(
"https://mtgjson.com/api/v5/AllIdentifiers.json.zip",
zip_path
)
self._print_progress("Unzipping file...")
json_path = await self._unzip_file(zip_path, self.identifiers_dir)
cards_processed = 0
current_batch = []
total_cards = 0
last_progress_time = time.time()
self._print_progress("Processing cards...")
try:
for item in self._stream_json_file(json_path):
if item["type"] == "meta":
self._print_progress(f"Processing MTGJSON data version {item['data'].get('version')} from {item['data'].get('date')}")
continue
card_data = item["data"]
card_id = list(card_data.keys())[0]
card_info = card_data[card_id]
total_cards += 1
current_batch.append({
"card_id": card_id,
"name": card_info.get("name"),
"set_code": card_info.get("setCode"),
"uuid": card_info.get("uuid"),
"abu_id": card_info.get("identifiers", {}).get("abuId"),
"card_kingdom_etched_id": card_info.get("identifiers", {}).get("cardKingdomEtchedId"),
"card_kingdom_foil_id": card_info.get("identifiers", {}).get("cardKingdomFoilId"),
"card_kingdom_id": card_info.get("identifiers", {}).get("cardKingdomId"),
"cardsphere_id": card_info.get("identifiers", {}).get("cardsphereId"),
"cardsphere_foil_id": card_info.get("identifiers", {}).get("cardsphereFoilId"),
"cardtrader_id": card_info.get("identifiers", {}).get("cardtraderId"),
"csi_id": card_info.get("identifiers", {}).get("csiId"),
"mcm_id": card_info.get("identifiers", {}).get("mcmId"),
"mcm_meta_id": card_info.get("identifiers", {}).get("mcmMetaId"),
"miniaturemarket_id": card_info.get("identifiers", {}).get("miniaturemarketId"),
"mtg_arena_id": card_info.get("identifiers", {}).get("mtgArenaId"),
"mtgjson_foil_version_id": card_info.get("identifiers", {}).get("mtgjsonFoilVersionId"),
"mtgjson_non_foil_version_id": card_info.get("identifiers", {}).get("mtgjsonNonFoilVersionId"),
"mtgjson_v4_id": card_info.get("identifiers", {}).get("mtgjsonV4Id"),
"mtgo_foil_id": card_info.get("identifiers", {}).get("mtgoFoilId"),
"mtgo_id": card_info.get("identifiers", {}).get("mtgoId"),
"multiverse_id": card_info.get("identifiers", {}).get("multiverseId"),
"scg_id": card_info.get("identifiers", {}).get("scgId"),
"scryfall_id": card_info.get("identifiers", {}).get("scryfallId"),
"scryfall_card_back_id": card_info.get("identifiers", {}).get("scryfallCardBackId"),
"scryfall_oracle_id": card_info.get("identifiers", {}).get("scryfallOracleId"),
"scryfall_illustration_id": card_info.get("identifiers", {}).get("scryfallIllustrationId"),
"tcgplayer_product_id": card_info.get("identifiers", {}).get("tcgplayerProductId"),
"tcgplayer_etched_product_id": card_info.get("identifiers", {}).get("tcgplayerEtchedProductId"),
"tnt_id": card_info.get("identifiers", {}).get("tntId"),
"data": card_info
})
if len(current_batch) >= self.batch_size:
batch_processed = await self._process_batch(db, current_batch, MTGJSONCard)
cards_processed += batch_processed
current_batch = []
current_time = time.time()
if current_time - last_progress_time >= 1.0: # Update progress every second
self._print_progress(f"\r{self._format_progress(cards_processed, total_cards, start_time)}", end="")
last_progress_time = current_time
except Exception as e:
self._print_progress(f"\nError during processing: {str(e)}")
raise
# Process remaining items
if current_batch:
batch_processed = await self._process_batch(db, current_batch, MTGJSONCard)
cards_processed += batch_processed
total_time = time.time() - start_time
self._print_progress(f"\nProcessing complete! Processed {cards_processed} cards in {total_time:.1f} seconds")
return {"cards_processed": cards_processed}
async def download_and_process_skus(self, db: Session) -> Dict[str, int]:
"""Download, unzip and process TcgplayerSkus.json.zip using streaming"""
self._print_progress("Starting MTGJSON SKUs processing...")
start_time = time.time()
zip_path = os.path.join(self.skus_dir, "TcgplayerSkus.json.zip")
await self._download_file(
"https://mtgjson.com/api/v5/TcgplayerSkus.json.zip",
zip_path
)
self._print_progress("Unzipping file...")
json_path = await self._unzip_file(zip_path, self.skus_dir)
skus_processed = 0
current_batch = []
total_skus = 0
last_progress_time = time.time()
self._print_progress("Processing SKUs...")
try:
for item in self._stream_json_file(json_path):
if item["type"] == "meta":
self._print_progress(f"Processing MTGJSON SKUs version {item['data'].get('version')} from {item['data'].get('date')}")
continue
# The data structure is {card_uuid: [sku1, sku2, ...]}
for card_uuid, sku_list in item["data"].items():
for sku in sku_list:
total_skus += 1
current_batch.append({
"sku_id": str(sku.get("skuId")),
"product_id": str(sku.get("productId")),
"condition": sku.get("condition"),
"finish": sku.get("finish", "NORMAL"), # Default to NORMAL if not specified
"language": sku.get("language"),
"printing": sku.get("printing"),
"card_id": card_uuid,
"data": sku
})
if len(current_batch) >= self.batch_size:
batch_processed = await self._process_batch(db, current_batch, MTGJSONSKU)
skus_processed += batch_processed
current_batch = []
current_time = time.time()
if current_time - last_progress_time >= 1.0: # Update progress every second
self._print_progress(f"\r{self._format_progress(skus_processed, total_skus, start_time)}", end="")
last_progress_time = current_time
except Exception as e:
self._print_progress(f"\nError during processing: {str(e)}")
raise
# Process remaining items
if current_batch:
batch_processed = await self._process_batch(db, current_batch, MTGJSONSKU)
skus_processed += batch_processed
total_time = time.time() - start_time
self._print_progress(f"\nProcessing complete! Processed {skus_processed} SKUs in {total_time:.1f} seconds")
return {"skus_processed": skus_processed}
async def clear_cache(self) -> None:
"""Clear all cached data"""
for subdir in ["identifiers", "skus"]:
dir_path = os.path.join(self.cache_dir, subdir)
if os.path.exists(dir_path):
for filename in os.listdir(dir_path):
file_path = os.path.join(dir_path, filename)
if os.path.isfile(file_path):
os.unlink(file_path)
print("MTGJSON cache cleared")