data init idk other stuff

2025-04-18 15:19:57 -04:00
parent 8f35cedb4a
commit 03b43ce3ab
28 changed files with 3378 additions and 810 deletions


@@ -1,29 +1,24 @@
import os
import json
import zipfile
import aiohttp
import asyncio
import time
import sys
from typing import Dict, Any, Optional, Generator
from sqlalchemy.orm import Session
from datetime import datetime
from app.models.mtgjson_card import MTGJSONCard
from app.models.mtgjson_sku import MTGJSONSKU
from app.db.database import get_db, transaction
from app.services.external_api.base_external_service import BaseExternalService
from app.schemas.file import FileInDB
import logging
logger = logging.getLogger(__name__)
class MTGJSONService(BaseExternalService):
def __init__(self, cache_dir: str = "app/data/cache/mtgjson", batch_size: int = 1000):
def __init__(self, cache_dir: str = "app/data/cache/mtgjson"):
super().__init__(base_url="https://mtgjson.com/api/v5/")
# Ensure the cache directory exists
os.makedirs(cache_dir, exist_ok=True)
self.cache_dir = cache_dir
self.identifiers_dir = os.path.join(cache_dir, "identifiers")
self.skus_dir = os.path.join(cache_dir, "skus")
self.batch_size = batch_size
# Create necessary directories
os.makedirs(cache_dir, exist_ok=True)
# Ensure subdirectories exist
os.makedirs(self.identifiers_dir, exist_ok=True)
os.makedirs(self.skus_dir, exist_ok=True)
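For orientation, a minimal usage sketch of the revised constructor; the directory layout shown in the comments follows from the os.makedirs calls above:

service = MTGJSONService()
# Resulting on-disk layout after construction:
#   app/data/cache/mtgjson/
#   app/data/cache/mtgjson/identifiers/
#   app/data/cache/mtgjson/skus/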
@@ -46,112 +41,133 @@ class MTGJSONService(BaseExternalService):
print(f"Downloading {url}...")
start_time = time.time()
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
if response.status == 200:
file_data = await response.read()
return await self.save_file(
db=db,
file_data=file_data,
file_name=filename,
subdir=f"mtgjson/{subdir}",
file_type=response.headers.get('content-type', 'application/octet-stream')
)
else:
raise Exception(f"Failed to download file from {url}. Status: {response.status}")
# Use the base external service's _make_request method
file_data = await self._make_request(
method="GET",
endpoint=url.replace(self.base_url, ""),
binary=True
)
# Save the file using the file service
return await self.file_service.save_file(
db=db,
file_data=file_data,
filename=filename,
subdir=f"mtgjson/{subdir}",
file_type="application/zip",
content_type="application/zip"
)
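The new download path delegates to BaseExternalService._make_request, which is not shown in this diff. A plausible shape for its binary branch, sketched here as an assumption using the aiohttp import above (the real signature may differ):

async def _make_request(self, method: str, endpoint: str, binary: bool = False):
    # Hypothetical sketch; the actual helper lives in BaseExternalService.
    url = self.base_url + endpoint
    async with aiohttp.ClientSession() as session:
        async with session.request(method, url) as response:
            response.raise_for_status()
            if binary:
                return await response.read()  # raw bytes for zip downloads
            return await response.json()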
async def _unzip_file(self, zip_path: str, extract_dir: str) -> str:
"""Unzip a file to the specified directory and return the path to the extracted JSON file"""
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
json_filename = zip_ref.namelist()[0]
zip_ref.extractall(extract_dir)
return os.path.join(extract_dir, json_filename)
async def _unzip_file(self, file_record: FileInDB, subdir: str, db: Session) -> str:
"""Unzip a file to the specified subdirectory and return the path to the extracted JSON file"""
try:
# Use the appropriate subdirectory based on the type
extract_path = self.identifiers_dir if subdir == "identifiers" else self.skus_dir
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(file_record.path, 'r') as zip_ref:
json_filename = zip_ref.namelist()[0]
zip_ref.extractall(extract_path)
json_path = os.path.join(extract_path, json_filename)
# Create a file record for the extracted JSON file
with open(json_path, 'r') as f:
json_data = f.read()
json_file_record = await self.file_service.save_file(
db=db,
file_data=json_data,
filename=json_filename,
subdir=f"mtgjson/{subdir}",
file_type="application/json",
content_type="application/json"
)
return str(json_file_record.path)
except Exception as e:
logger.error(f"Error unzipping file: {e}")
raise
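Both the old and new _unzip_file assume the archive contains exactly one JSON member and take the first name in the listing. A minimal standalone sketch of that step, using the zipfile and os imports above:

def extract_single_json(zip_path: str, extract_dir: str) -> str:
    # Assumes the MTGJSON archive holds exactly one JSON file.
    with zipfile.ZipFile(zip_path, "r") as zf:
        json_name = zf.namelist()[0]
        zf.extractall(extract_dir)
    return os.path.join(extract_dir, json_name)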
def _stream_json_file(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
"""Stream a JSON file and yield items one at a time"""
print(f"Starting to stream JSON file: {file_path}")
with open(file_path, 'r') as f:
# Load the entire file since MTGJSON uses a specific format
data = json.load(f)
# First yield the meta data
if "meta" in data:
yield {"type": "meta", "data": data["meta"]}
# Then yield each item in the data section
if "data" in data:
for key, value in data["data"].items():
yield {"type": "item", "data": {key: value}}
async def _process_batch(self, db: Session, items: list, model_class) -> int:
"""Process a batch of items and add them to the database"""
processed = 0
with transaction(db):
for item in items:
if model_class == MTGJSONCard:
# Check if card already exists
existing_card = db.query(MTGJSONCard).filter(MTGJSONCard.card_id == item["card_id"]).first()
if existing_card:
"""Stream a JSON file and yield items one at a time using a streaming parser"""
logger.info(f"Starting to stream JSON file: {file_path}")
try:
with open(file_path, 'r') as f:
# First, we need to find the start of the data section
data_started = False
current_key = None
current_value = []
brace_count = 0
for line in f:
line = line.strip()
if not line:
continue
new_item = MTGJSONCard(
card_id=item["card_id"],
name=item["name"],
set_code=item["set_code"],
uuid=item["uuid"],
abu_id=item.get("abu_id"),
card_kingdom_etched_id=item.get("card_kingdom_etched_id"),
card_kingdom_foil_id=item.get("card_kingdom_foil_id"),
card_kingdom_id=item.get("card_kingdom_id"),
cardsphere_id=item.get("cardsphere_id"),
cardsphere_foil_id=item.get("cardsphere_foil_id"),
cardtrader_id=item.get("cardtrader_id"),
csi_id=item.get("csi_id"),
mcm_id=item.get("mcm_id"),
mcm_meta_id=item.get("mcm_meta_id"),
miniaturemarket_id=item.get("miniaturemarket_id"),
mtg_arena_id=item.get("mtg_arena_id"),
mtgjson_foil_version_id=item.get("mtgjson_foil_version_id"),
mtgjson_non_foil_version_id=item.get("mtgjson_non_foil_version_id"),
mtgjson_v4_id=item.get("mtgjson_v4_id"),
mtgo_foil_id=item.get("mtgo_foil_id"),
mtgo_id=item.get("mtgo_id"),
multiverse_id=item.get("multiverse_id"),
scg_id=item.get("scg_id"),
scryfall_id=item.get("scryfall_id"),
scryfall_card_back_id=item.get("scryfall_card_back_id"),
scryfall_oracle_id=item.get("scryfall_oracle_id"),
scryfall_illustration_id=item.get("scryfall_illustration_id"),
tcgplayer_product_id=item.get("tcgplayer_product_id"),
tcgplayer_etched_product_id=item.get("tcgplayer_etched_product_id"),
tnt_id=item.get("tnt_id")
)
else: # MTGJSONSKU
# Check if SKU already exists
existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == item["sku_id"]).first()
if existing_sku:
continue
new_item = MTGJSONSKU(
sku_id=str(item["sku_id"]),
product_id=str(item["product_id"]),
condition=item["condition"],
finish=item["finish"],
language=item["language"],
printing=item["printing"],
card_id=item["card_id"]
)
db.add(new_item)
processed += 1
if not data_started:
if '"data":' in line:
data_started = True
# Skip the opening brace of the data object
line = line[line.find('"data":') + 7:].strip()
if line.startswith('{'):
line = line[1:].strip()
else:
# Yield meta data if found
if '"meta":' in line:
meta_start = line.find('"meta":') + 7
meta_end = line.rfind('}')
if meta_end > meta_start:
meta_json = line[meta_start:meta_end + 1]
try:
meta_data = json.loads(meta_json)
yield {"type": "meta", "data": meta_data}
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse meta data: {e}")
continue
# Process the data section
if data_started:
if not current_key:
# Look for a new key
if '"' in line:
key_start = line.find('"') + 1
key_end = line.find('"', key_start)
if key_end > key_start:
current_key = line[key_start:key_end]
# Get the rest of the line after the key
line = line[key_end + 1:].strip()
if ':' in line:
line = line[line.find(':') + 1:].strip()
if current_key:
# Accumulate the value
current_value.append(line)
brace_count += line.count('{') - line.count('}')
if brace_count == 0 and line.endswith(','):
# We have a complete value
value_str = ''.join(current_value).rstrip(',')
try:
value = json.loads(value_str)
yield {"type": "item", "data": {current_key: value}}
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse value for key {current_key}: {e}")
current_key = None
current_value = []
except Exception as e:
logger.error(f"Error streaming JSON file: {e}")
raise
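The hand-rolled line scanner above assumes pretty-printed input with one key per line and a trailing comma after each value. An incremental parser such as the third-party ijson library (an assumption; it is not a dependency added by this commit) would avoid that fragility:

import ijson  # third-party streaming JSON parser; hypothetical alternative

def stream_items(file_path: str):
    # Yield {key: value} pairs from the top-level "data" object incrementally.
    with open(file_path, "rb") as f:
        for key, value in ijson.kvitems(f, "data"):
            yield {"type": "item", "data": {key: value}}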
return processed
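The removed _process_batch wrapped its inserts in a transaction helper imported from app.db.database. That helper is not shown in this diff; a rough sketch of how such a context manager is often built over SQLAlchemy, offered purely as an assumption:

from contextlib import contextmanager

@contextmanager
def transaction(db: Session):
    # Hypothetical shape: commit on success, roll back on any error.
    try:
        yield db
        db.commit()
    except Exception:
        db.rollback()
        raise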
async def download_and_process_identifiers(self, db: Session) -> Dict[str, int]:
"""Download, unzip and process AllIdentifiers.json.zip using streaming"""
self._print_progress("Starting MTGJSON identifiers processing...")
start_time = time.time()
async def get_identifiers(self, db: Session) -> Generator[Dict[str, Any], None, None]:
"""Download and get MTGJSON identifiers data"""
# Check if we have a cached version
cached_file = await self.file_service.get_file_by_filename(db, "AllIdentifiers.json")
if cached_file:
# Ensure the file exists at the path
if os.path.exists(cached_file.path):
return self._stream_json_file(cached_file.path)
# Download the file using FileService
# Download and process the file
file_record = await self._download_file(
db=db,
url="https://mtgjson.com/api/v5/AllIdentifiers.json.zip",
@@ -159,87 +175,22 @@ class MTGJSONService(BaseExternalService):
subdir="identifiers"
)
# Get the file path from the database record
zip_path = file_record.path
# Unzip and process the file
json_path = await self._unzip_file(file_record, "identifiers", db)
cards_processed = 0
current_batch = []
total_cards = 0
last_progress_time = time.time()
self._print_progress("Processing cards...")
try:
for item in self._stream_json_file(zip_path):
if item["type"] == "meta":
self._print_progress(f"Processing MTGJSON data version {item['data'].get('version')} from {item['data'].get('date')}")
continue
card_data = item["data"]
card_id = list(card_data.keys())[0]
card_info = card_data[card_id]
total_cards += 1
current_batch.append({
"card_id": card_id,
"name": card_info.get("name"),
"set_code": card_info.get("setCode"),
"uuid": card_info.get("uuid"),
"abu_id": card_info.get("identifiers", {}).get("abuId"),
"card_kingdom_etched_id": card_info.get("identifiers", {}).get("cardKingdomEtchedId"),
"card_kingdom_foil_id": card_info.get("identifiers", {}).get("cardKingdomFoilId"),
"card_kingdom_id": card_info.get("identifiers", {}).get("cardKingdomId"),
"cardsphere_id": card_info.get("identifiers", {}).get("cardsphereId"),
"cardsphere_foil_id": card_info.get("identifiers", {}).get("cardsphereFoilId"),
"cardtrader_id": card_info.get("identifiers", {}).get("cardtraderId"),
"csi_id": card_info.get("identifiers", {}).get("csiId"),
"mcm_id": card_info.get("identifiers", {}).get("mcmId"),
"mcm_meta_id": card_info.get("identifiers", {}).get("mcmMetaId"),
"miniaturemarket_id": card_info.get("identifiers", {}).get("miniaturemarketId"),
"mtg_arena_id": card_info.get("identifiers", {}).get("mtgArenaId"),
"mtgjson_foil_version_id": card_info.get("identifiers", {}).get("mtgjsonFoilVersionId"),
"mtgjson_non_foil_version_id": card_info.get("identifiers", {}).get("mtgjsonNonFoilVersionId"),
"mtgjson_v4_id": card_info.get("identifiers", {}).get("mtgjsonV4Id"),
"mtgo_foil_id": card_info.get("identifiers", {}).get("mtgoFoilId"),
"mtgo_id": card_info.get("identifiers", {}).get("mtgoId"),
"multiverse_id": card_info.get("identifiers", {}).get("multiverseId"),
"scg_id": card_info.get("identifiers", {}).get("scgId"),
"scryfall_id": card_info.get("identifiers", {}).get("scryfallId"),
"scryfall_card_back_id": card_info.get("identifiers", {}).get("scryfallCardBackId"),
"scryfall_oracle_id": card_info.get("identifiers", {}).get("scryfallOracleId"),
"scryfall_illustration_id": card_info.get("identifiers", {}).get("scryfallIllustrationId"),
"tcgplayer_product_id": card_info.get("identifiers", {}).get("tcgplayerProductId"),
"tcgplayer_etched_product_id": card_info.get("identifiers", {}).get("tcgplayerEtchedProductId"),
"tnt_id": card_info.get("identifiers", {}).get("tntId"),
"data": card_info
})
if len(current_batch) >= self.batch_size:
batch_processed = await self._process_batch(db, current_batch, MTGJSONCard)
cards_processed += batch_processed
current_batch = []
current_time = time.time()
if current_time - last_progress_time >= 1.0: # Update progress every second
self._print_progress(f"\r{self._format_progress(cards_processed, total_cards, start_time)}", end="")
last_progress_time = current_time
except Exception as e:
self._print_progress(f"\nError during processing: {str(e)}")
raise
# Process remaining items
if current_batch:
batch_processed = await self._process_batch(db, current_batch, MTGJSONCard)
cards_processed += batch_processed
total_time = time.time() - start_time
self._print_progress(f"\nProcessing complete! Processed {cards_processed} cards in {total_time:.1f} seconds")
return {"cards_processed": cards_processed}
# Return a generator that streams the JSON file
return self._stream_json_file(json_path)
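Note that get_identifiers is a coroutine that returns a synchronous generator, so a caller awaits it once and then iterates normally. A consumption sketch, assuming a Session obtained elsewhere:

async def load_identifiers(service: MTGJSONService, db: Session) -> None:
    stream = await service.get_identifiers(db)
    for item in stream:
        if item["type"] == "meta":
            logger.info(f"MTGJSON version {item['data'].get('version')}")
            continue
        # Each item wraps a single {card_id: card_info} pair.
        card_id, card_info = next(iter(item["data"].items()))
        # ... persist card_id / card_info here ...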
async def download_and_process_skus(self, db: Session) -> Dict[str, int]:
"""Download, unzip and process TcgplayerSkus.json.zip using streaming"""
self._print_progress("Starting MTGJSON SKUs processing...")
start_time = time.time()
async def get_skus(self, db: Session) -> Generator[Dict[str, Any], None, None]:
"""Download and get MTGJSON SKUs data"""
# Check if we have a cached version
cached_file = await self.file_service.get_file_by_filename(db, "TcgplayerSkus.json")
if cached_file:
# Ensure the file exists at the path
if os.path.exists(cached_file.path):
return self._stream_json_file(cached_file.path)
# Download the file using FileService
# Download and process the file
file_record = await self._download_file(
db=db,
url="https://mtgjson.com/api/v5/TcgplayerSkus.json.zip",
@@ -247,64 +198,21 @@ class MTGJSONService(BaseExternalService):
subdir="skus"
)
# Get the file path from the database record
zip_path = file_record.path
# Unzip and process the file
json_path = await self._unzip_file(file_record, "skus", db)
skus_processed = 0
current_batch = []
total_skus = 0
last_progress_time = time.time()
self._print_progress("Processing SKUs...")
try:
for item in self._stream_json_file(zip_path):
if item["type"] == "meta":
self._print_progress(f"Processing MTGJSON SKUs version {item['data'].get('version')} from {item['data'].get('date')}")
continue
# The data structure is {card_uuid: [sku1, sku2, ...]}
for card_uuid, sku_list in item["data"].items():
for sku in sku_list:
total_skus += 1
current_batch.append({
"sku_id": str(sku.get("skuId")),
"product_id": str(sku.get("productId")),
"condition": sku.get("condition"),
"finish": sku.get("finish"),
"language": sku.get("language"),
"printing": sku.get("printing"),
"card_id": card_uuid,
"data": sku
})
if len(current_batch) >= self.batch_size:
batch_processed = await self._process_batch(db, current_batch, MTGJSONSKU)
skus_processed += batch_processed
current_batch = []
current_time = time.time()
if current_time - last_progress_time >= 1.0: # Update progress every second
self._print_progress(f"\r{self._format_progress(skus_processed, total_skus, start_time)}", end="")
last_progress_time = current_time
except Exception as e:
self._print_progress(f"\nError during processing: {str(e)}")
raise
# Process remaining items
if current_batch:
batch_processed = await self._process_batch(db, current_batch, MTGJSONSKU)
skus_processed += batch_processed
total_time = time.time() - start_time
self._print_progress(f"\nProcessing complete! Processed {skus_processed} SKUs in {total_time:.1f} seconds")
return {"skus_processed": skus_processed}
# Return a generator that streams the JSON file
return self._stream_json_file(json_path)
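For SKUs, the "data" section maps each card UUID to a list of SKU objects (as the removed processing loop shows), so a consumer flattens one level further. A sketch under the same assumptions as above:

async def load_skus(service: MTGJSONService, db: Session) -> None:
    stream = await service.get_skus(db)
    for item in stream:
        if item["type"] == "meta":
            continue
        # Each item is {card_uuid: [sku, ...]} per the TcgplayerSkus layout.
        for card_uuid, sku_list in item["data"].items():
            for sku in sku_list:
                logger.debug(f"{card_uuid}: {sku.get('skuId')} {sku.get('condition')}")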
async def clear_cache(self) -> None:
async def clear_cache(self, db: Session) -> None:
"""Clear all cached data"""
for subdir in ["identifiers", "skus"]:
dir_path = os.path.join(self.cache_dir, subdir)
if os.path.exists(dir_path):
for filename in os.listdir(dir_path):
file_path = os.path.join(dir_path, filename)
if os.path.isfile(file_path):
os.unlink(file_path)
print("MTGJSON cache cleared")
try:
# Delete all files in the mtgjson subdirectory
files = await self.file_service.list_files(db, file_type=["json", "zip"])
for file in files:
if file.path.startswith("mtgjson/"):
await self.file_service.delete_file(db, file.id)
logger.info("MTGJSON cache cleared")
except Exception as e:
logger.error(f"Error clearing cache: {e}")
raise
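Taken together, a minimal end-to-end sketch of the new surface; treating get_db as a generator dependency is an assumption based on the import at the top of the file:

async def main() -> None:
    db = next(get_db())  # assumption: get_db yields a Session
    service = MTGJSONService()
    try:
        identifiers = await service.get_identifiers(db)
        for entry in identifiers:
            pass  # handle meta/item entries as sketched above
    finally:
        db.close()

if __name__ == "__main__":
    asyncio.run(main())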