Data init and miscellaneous cleanup of the MTGJSON service
@@ -1,29 +1,24 @@
 import os
 import json
 import zipfile
 import aiohttp
 import asyncio
 import time
 import sys
 from typing import Dict, Any, Optional, Generator
 from sqlalchemy.orm import Session
 from datetime import datetime
 from app.models.mtgjson_card import MTGJSONCard
 from app.models.mtgjson_sku import MTGJSONSKU
 from app.db.database import get_db, transaction
 from app.services.external_api.base_external_service import BaseExternalService
 from app.schemas.file import FileInDB
 import logging

 logger = logging.getLogger(__name__)

 class MTGJSONService(BaseExternalService):
-    def __init__(self, cache_dir: str = "app/data/cache/mtgjson", batch_size: int = 1000):
+    def __init__(self, cache_dir: str = "app/data/cache/mtgjson"):
         super().__init__(base_url="https://mtgjson.com/api/v5/")
+        # Ensure the cache directory exists
+        os.makedirs(cache_dir, exist_ok=True)
         self.cache_dir = cache_dir
         self.identifiers_dir = os.path.join(cache_dir, "identifiers")
         self.skus_dir = os.path.join(cache_dir, "skus")
-        self.batch_size = batch_size
-
-        # Create necessary directories
-        os.makedirs(cache_dir, exist_ok=True)
+        # Ensure subdirectories exist
         os.makedirs(self.identifiers_dir, exist_ok=True)
         os.makedirs(self.skus_dir, exist_ok=True)
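For orientation, a minimal usage sketch of the refactored constructor; the call site below is hypothetical, but the names and cache layout come from this diff:

```python
# Hypothetical call site. batch_size is gone from the signature because the
# service now streams items instead of accumulating database batches.
service = MTGJSONService(cache_dir="app/data/cache/mtgjson")

# __init__ creates these directories idempotently (exist_ok=True), so
# constructing the service twice is safe:
#   app/data/cache/mtgjson/identifiers/
#   app/data/cache/mtgjson/skus/
```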
@@ -46,112 +41,133 @@ class MTGJSONService(BaseExternalService):
-        print(f"Downloading {url}...")
-        start_time = time.time()
-
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    file_data = await response.read()
-                    return await self.save_file(
-                        db=db,
-                        file_data=file_data,
-                        file_name=filename,
-                        subdir=f"mtgjson/{subdir}",
-                        file_type=response.headers.get('content-type', 'application/octet-stream')
-                    )
-                else:
-                    raise Exception(f"Failed to download file from {url}. Status: {response.status}")
+        # Use the base external service's _make_request method
+        file_data = await self._make_request(
+            method="GET",
+            endpoint=url.replace(self.base_url, ""),
+            binary=True
+        )
+
+        # Save the file using the file service
+        return await self.file_service.save_file(
+            db=db,
+            file_data=file_data,
+            filename=filename,
+            subdir=f"mtgjson/{subdir}",
+            file_type="application/zip",
+            content_type="application/zip"
+        )
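BaseExternalService._make_request is not part of this diff, so its exact signature is an assumption; a minimal sketch of the binary path it would need, using aiohttp:

```python
import aiohttp

class BaseExternalService:
    """Sketch only: the real base class lives in
    app/services/external_api/base_external_service.py and is not shown here."""

    def __init__(self, base_url: str):
        self.base_url = base_url

    async def _make_request(self, method: str, endpoint: str, binary: bool = False):
        # Resolve the endpoint against base_url, mirroring the
        # url.replace(self.base_url, "") call at the call site above.
        url = self.base_url + endpoint
        async with aiohttp.ClientSession() as session:
            async with session.request(method, url) as response:
                response.raise_for_status()  # surface HTTP errors early
                return await response.read() if binary else await response.json()
```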
-    async def _unzip_file(self, zip_path: str, extract_dir: str) -> str:
-        """Unzip a file to the specified directory and return the path to the extracted JSON file"""
-        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-            json_filename = zip_ref.namelist()[0]
-            zip_ref.extractall(extract_dir)
-            return os.path.join(extract_dir, json_filename)
+    async def _unzip_file(self, file_record: FileInDB, subdir: str, db: Session) -> str:
+        """Unzip a file to the specified subdirectory and return the path to the extracted JSON file"""
+        try:
+            # Use the appropriate subdirectory based on the type
+            extract_path = self.identifiers_dir if subdir == "identifiers" else self.skus_dir
+            os.makedirs(extract_path, exist_ok=True)
+
+            with zipfile.ZipFile(file_record.path, 'r') as zip_ref:
+                json_filename = zip_ref.namelist()[0]
+                zip_ref.extractall(extract_path)
+                json_path = os.path.join(extract_path, json_filename)
+
+                # Create a file record for the extracted JSON file
+                with open(json_path, 'r') as f:
+                    json_data = f.read()
+                json_file_record = await self.file_service.save_file(
+                    db=db,
+                    file_data=json_data,
+                    filename=json_filename,
+                    subdir=f"mtgjson/{subdir}",
+                    file_type="application/json",
+                    content_type="application/json"
+                )
+
+                return str(json_file_record.path)
+        except Exception as e:
+            logger.error(f"Error unzipping file: {e}")
+            raise
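Both versions of _unzip_file trust zip_ref.namelist()[0], which silently picks an arbitrary member if the archive ever ships more than one file and does not guard against path traversal. A more defensive variant might look like this (a sketch, not part of the commit):

```python
import os
import zipfile

def extract_single_json(zip_path: str, extract_dir: str) -> str:
    """Extract the archive's single .json member, refusing traversal paths."""
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        members = [m for m in zip_ref.namelist() if m.endswith(".json")]
        if len(members) != 1:
            raise ValueError(f"Expected exactly one JSON member, found {len(members)}")
        member = members[0]
        target = os.path.realpath(os.path.join(extract_dir, member))
        if not target.startswith(os.path.realpath(extract_dir) + os.sep):
            raise ValueError(f"Unsafe member path: {member}")
        zip_ref.extract(member, extract_dir)
        return target
```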
     def _stream_json_file(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
-        """Stream a JSON file and yield items one at a time"""
-        print(f"Starting to stream JSON file: {file_path}")
-        with open(file_path, 'r') as f:
-            # Load the entire file since MTGJSON uses a specific format
-            data = json.load(f)
-
-            # First yield the meta data
-            if "meta" in data:
-                yield {"type": "meta", "data": data["meta"]}
-
-            # Then yield each item in the data section
-            if "data" in data:
-                for key, value in data["data"].items():
-                    yield {"type": "item", "data": {key: value}}
-
-    async def _process_batch(self, db: Session, items: list, model_class) -> int:
-        """Process a batch of items and add them to the database"""
-        processed = 0
-        with transaction(db):
-            for item in items:
-                if model_class == MTGJSONCard:
-                    # Check if card already exists
-                    existing_card = db.query(MTGJSONCard).filter(MTGJSONCard.card_id == item["card_id"]).first()
-                    if existing_card:
+        """Stream a JSON file and yield items one at a time using a streaming parser"""
+        logger.info(f"Starting to stream JSON file: {file_path}")
+        try:
+            with open(file_path, 'r') as f:
+                # First, we need to find the start of the data section
+                data_started = False
+                current_key = None
+                current_value = []
+                brace_count = 0
+
+                for line in f:
+                    line = line.strip()
+                    if not line:
                         continue
-
-                    new_item = MTGJSONCard(
-                        card_id=item["card_id"],
-                        name=item["name"],
-                        set_code=item["set_code"],
-                        uuid=item["uuid"],
-                        abu_id=item.get("abu_id"),
-                        card_kingdom_etched_id=item.get("card_kingdom_etched_id"),
-                        card_kingdom_foil_id=item.get("card_kingdom_foil_id"),
-                        card_kingdom_id=item.get("card_kingdom_id"),
-                        cardsphere_id=item.get("cardsphere_id"),
-                        cardsphere_foil_id=item.get("cardsphere_foil_id"),
-                        cardtrader_id=item.get("cardtrader_id"),
-                        csi_id=item.get("csi_id"),
-                        mcm_id=item.get("mcm_id"),
-                        mcm_meta_id=item.get("mcm_meta_id"),
-                        miniaturemarket_id=item.get("miniaturemarket_id"),
-                        mtg_arena_id=item.get("mtg_arena_id"),
-                        mtgjson_foil_version_id=item.get("mtgjson_foil_version_id"),
-                        mtgjson_non_foil_version_id=item.get("mtgjson_non_foil_version_id"),
-                        mtgjson_v4_id=item.get("mtgjson_v4_id"),
-                        mtgo_foil_id=item.get("mtgo_foil_id"),
-                        mtgo_id=item.get("mtgo_id"),
-                        multiverse_id=item.get("multiverse_id"),
-                        scg_id=item.get("scg_id"),
-                        scryfall_id=item.get("scryfall_id"),
-                        scryfall_card_back_id=item.get("scryfall_card_back_id"),
-                        scryfall_oracle_id=item.get("scryfall_oracle_id"),
-                        scryfall_illustration_id=item.get("scryfall_illustration_id"),
-                        tcgplayer_product_id=item.get("tcgplayer_product_id"),
-                        tcgplayer_etched_product_id=item.get("tcgplayer_etched_product_id"),
-                        tnt_id=item.get("tnt_id")
-                    )
-                else: # MTGJSONSKU
-                    # Check if SKU already exists
-                    existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == item["sku_id"]).first()
-                    if existing_sku:
-                        continue
-
-                    new_item = MTGJSONSKU(
-                        sku_id=str(item["sku_id"]),
-                        product_id=str(item["product_id"]),
-                        condition=item["condition"],
-                        finish=item["finish"],
-                        language=item["language"],
-                        printing=item["printing"],
-                        card_id=item["card_id"]
-                    )
-                db.add(new_item)
-                processed += 1
+                    if not data_started:
+                        if '"data":' in line:
+                            data_started = True
+                            # Skip the opening brace of the data object
+                            line = line[line.find('"data":') + 7:].strip()
+                            if line.startswith('{'):
+                                line = line[1:].strip()
+                    else:
+                        # Yield meta data if found
+                        if '"meta":' in line:
+                            meta_start = line.find('"meta":') + 7
+                            meta_end = line.rfind('}')
+                            if meta_end > meta_start:
+                                meta_json = line[meta_start:meta_end + 1]
+                                try:
+                                    meta_data = json.loads(meta_json)
+                                    yield {"type": "meta", "data": meta_data}
+                                except json.JSONDecodeError as e:
+                                    logger.warning(f"Failed to parse meta data: {e}")
+                        continue
+
+                    # Process the data section
+                    if data_started:
+                        if not current_key:
+                            # Look for a new key
+                            if '"' in line:
+                                key_start = line.find('"') + 1
+                                key_end = line.find('"', key_start)
+                                if key_end > key_start:
+                                    current_key = line[key_start:key_end]
+                                    # Get the rest of the line after the key
+                                    line = line[key_end + 1:].strip()
+                                    if ':' in line:
+                                        line = line[line.find(':') + 1:].strip()
+
+                        if current_key:
+                            # Accumulate the value
+                            current_value.append(line)
+                            brace_count += line.count('{') - line.count('}')
+
+                            if brace_count == 0 and line.endswith(','):
+                                # We have a complete value
+                                value_str = ''.join(current_value).rstrip(',')
+                                try:
+                                    value = json.loads(value_str)
+                                    yield {"type": "item", "data": {current_key: value}}
+                                except json.JSONDecodeError as e:
+                                    logger.warning(f"Failed to parse value for key {current_key}: {e}")
+                                current_key = None
+                                current_value = []
+
+        except Exception as e:
+            logger.error(f"Error streaming JSON file: {e}")
+            raise
-
-        return processed
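The hand-rolled parser added above assumes MTGJSON's pretty-printed layout (one key per line, each value closed by a trailing comma) and will mis-split on string values that happen to contain braces. If adding a dependency is acceptable, ijson yields the same event shape from a true incremental parse; a sketch assuming ijson is installed:

```python
import ijson
from typing import Any, Dict, Generator

def stream_mtgjson_file(file_path: str) -> Generator[Dict[str, Any], None, None]:
    """Yield the meta block, then each key under "data", with O(1) memory."""
    with open(file_path, "rb") as f:
        # First pass: the small top-level "meta" object.
        for meta in ijson.items(f, "meta"):
            yield {"type": "meta", "data": meta}
    with open(file_path, "rb") as f:
        # Second pass: stream each (card_id, payload) pair under "data".
        for key, value in ijson.kvitems(f, "data"):
            yield {"type": "item", "data": {key: value}}
```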
-    async def download_and_process_identifiers(self, db: Session) -> Dict[str, int]:
-        """Download, unzip and process AllIdentifiers.json.zip using streaming"""
-        self._print_progress("Starting MTGJSON identifiers processing...")
-        start_time = time.time()
+    async def get_identifiers(self, db: Session) -> Generator[Dict[str, Any], None, None]:
+        """Download and get MTGJSON identifiers data"""
+        # Check if we have a cached version
+        cached_file = await self.file_service.get_file_by_filename(db, "AllIdentifiers.json")
+        if cached_file:
+            # Ensure the file exists at the path
+            if os.path.exists(cached_file.path):
+                return self._stream_json_file(cached_file.path)
+
-        # Download the file using FileService
+        # Download and process the file
         file_record = await self._download_file(
             db=db,
             url="https://mtgjson.com/api/v5/AllIdentifiers.json.zip",
@@ -159,87 +175,22 @@ class MTGJSONService(BaseExternalService):
             subdir="identifiers"
         )

-        # Get the file path from the database record
-        zip_path = file_record.path
+        # Unzip and process the file
+        json_path = await self._unzip_file(file_record, "identifiers", db)

-        cards_processed = 0
-        current_batch = []
-        total_cards = 0
-        last_progress_time = time.time()
-
-        self._print_progress("Processing cards...")
-        try:
-            for item in self._stream_json_file(zip_path):
-                if item["type"] == "meta":
-                    self._print_progress(f"Processing MTGJSON data version {item['data'].get('version')} from {item['data'].get('date')}")
-                    continue
-
-                card_data = item["data"]
-                card_id = list(card_data.keys())[0]
-                card_info = card_data[card_id]
-                total_cards += 1
-
-                current_batch.append({
-                    "card_id": card_id,
-                    "name": card_info.get("name"),
-                    "set_code": card_info.get("setCode"),
-                    "uuid": card_info.get("uuid"),
-                    "abu_id": card_info.get("identifiers", {}).get("abuId"),
-                    "card_kingdom_etched_id": card_info.get("identifiers", {}).get("cardKingdomEtchedId"),
-                    "card_kingdom_foil_id": card_info.get("identifiers", {}).get("cardKingdomFoilId"),
-                    "card_kingdom_id": card_info.get("identifiers", {}).get("cardKingdomId"),
-                    "cardsphere_id": card_info.get("identifiers", {}).get("cardsphereId"),
-                    "cardsphere_foil_id": card_info.get("identifiers", {}).get("cardsphereFoilId"),
-                    "cardtrader_id": card_info.get("identifiers", {}).get("cardtraderId"),
-                    "csi_id": card_info.get("identifiers", {}).get("csiId"),
-                    "mcm_id": card_info.get("identifiers", {}).get("mcmId"),
-                    "mcm_meta_id": card_info.get("identifiers", {}).get("mcmMetaId"),
-                    "miniaturemarket_id": card_info.get("identifiers", {}).get("miniaturemarketId"),
-                    "mtg_arena_id": card_info.get("identifiers", {}).get("mtgArenaId"),
-                    "mtgjson_foil_version_id": card_info.get("identifiers", {}).get("mtgjsonFoilVersionId"),
-                    "mtgjson_non_foil_version_id": card_info.get("identifiers", {}).get("mtgjsonNonFoilVersionId"),
-                    "mtgjson_v4_id": card_info.get("identifiers", {}).get("mtgjsonV4Id"),
-                    "mtgo_foil_id": card_info.get("identifiers", {}).get("mtgoFoilId"),
-                    "mtgo_id": card_info.get("identifiers", {}).get("mtgoId"),
-                    "multiverse_id": card_info.get("identifiers", {}).get("multiverseId"),
-                    "scg_id": card_info.get("identifiers", {}).get("scgId"),
-                    "scryfall_id": card_info.get("identifiers", {}).get("scryfallId"),
-                    "scryfall_card_back_id": card_info.get("identifiers", {}).get("scryfallCardBackId"),
-                    "scryfall_oracle_id": card_info.get("identifiers", {}).get("scryfallOracleId"),
-                    "scryfall_illustration_id": card_info.get("identifiers", {}).get("scryfallIllustrationId"),
-                    "tcgplayer_product_id": card_info.get("identifiers", {}).get("tcgplayerProductId"),
-                    "tcgplayer_etched_product_id": card_info.get("identifiers", {}).get("tcgplayerEtchedProductId"),
-                    "tnt_id": card_info.get("identifiers", {}).get("tntId"),
-                    "data": card_info
-                })
-
-                if len(current_batch) >= self.batch_size:
-                    batch_processed = await self._process_batch(db, current_batch, MTGJSONCard)
-                    cards_processed += batch_processed
-                    current_batch = []
-                    current_time = time.time()
-                    if current_time - last_progress_time >= 1.0: # Update progress every second
-                        self._print_progress(f"\r{self._format_progress(cards_processed, total_cards, start_time)}", end="")
-                        last_progress_time = current_time
-        except Exception as e:
-            self._print_progress(f"\nError during processing: {str(e)}")
-            raise
-
-        # Process remaining items
-        if current_batch:
-            batch_processed = await self._process_batch(db, current_batch, MTGJSONCard)
-            cards_processed += batch_processed
-
-        total_time = time.time() - start_time
-        self._print_progress(f"\nProcessing complete! Processed {cards_processed} cards in {total_time:.1f} seconds")
-        return {"cards_processed": cards_processed}
+        # Return a generator that streams the JSON file
+        return self._stream_json_file(json_path)
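With the processing loop removed, callers now consume the generator themselves. A hypothetical caller (the session wiring is assumed; the service API is from this diff):

```python
from sqlalchemy.orm import Session

async def count_identifier_cards(db: Session) -> int:
    """Hypothetical example: count card entries without materializing them."""
    service = MTGJSONService()
    stream = await service.get_identifiers(db)  # returns a generator of events
    total = 0
    for item in stream:
        if item["type"] == "item":
            total += 1
    return total
```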
-    async def download_and_process_skus(self, db: Session) -> Dict[str, int]:
-        """Download, unzip and process TcgplayerSkus.json.zip using streaming"""
-        self._print_progress("Starting MTGJSON SKUs processing...")
-        start_time = time.time()
+    async def get_skus(self, db: Session) -> Generator[Dict[str, Any], None, None]:
+        """Download and get MTGJSON SKUs data"""
+        # Check if we have a cached version
+        cached_file = await self.file_service.get_file_by_filename(db, "TcgplayerSkus.json")
+        if cached_file:
+            # Ensure the file exists at the path
+            if os.path.exists(cached_file.path):
+                return self._stream_json_file(cached_file.path)
+
-        # Download the file using FileService
+        # Download and process the file
         file_record = await self._download_file(
             db=db,
             url="https://mtgjson.com/api/v5/TcgplayerSkus.json.zip",
@@ -247,64 +198,21 @@ class MTGJSONService(BaseExternalService):
             subdir="skus"
         )

-        # Get the file path from the database record
-        zip_path = file_record.path
+        # Unzip and process the file
+        json_path = await self._unzip_file(file_record, "skus", db)

-        skus_processed = 0
-        current_batch = []
-        total_skus = 0
-        last_progress_time = time.time()
-
-        self._print_progress("Processing SKUs...")
-        try:
-            for item in self._stream_json_file(zip_path):
-                if item["type"] == "meta":
-                    self._print_progress(f"Processing MTGJSON SKUs version {item['data'].get('version')} from {item['data'].get('date')}")
-                    continue
-
-                # The data structure is {card_uuid: [sku1, sku2, ...]}
-                for card_uuid, sku_list in item["data"].items():
-                    for sku in sku_list:
-                        total_skus += 1
-                        current_batch.append({
-                            "sku_id": str(sku.get("skuId")),
-                            "product_id": str(sku.get("productId")),
-                            "condition": sku.get("condition"),
-                            "finish": sku.get("finish"),
-                            "language": sku.get("language"),
-                            "printing": sku.get("printing"),
-                            "card_id": card_uuid,
-                            "data": sku
-                        })
-
-                        if len(current_batch) >= self.batch_size:
-                            batch_processed = await self._process_batch(db, current_batch, MTGJSONSKU)
-                            skus_processed += batch_processed
-                            current_batch = []
-                            current_time = time.time()
-                            if current_time - last_progress_time >= 1.0: # Update progress every second
-                                self._print_progress(f"\r{self._format_progress(skus_processed, total_skus, start_time)}", end="")
-                                last_progress_time = current_time
-        except Exception as e:
-            self._print_progress(f"\nError during processing: {str(e)}")
-            raise
-
-        # Process remaining items
-        if current_batch:
-            batch_processed = await self._process_batch(db, current_batch, MTGJSONSKU)
-            skus_processed += batch_processed
-
-        total_time = time.time() - start_time
-        self._print_progress(f"\nProcessing complete! Processed {skus_processed} SKUs in {total_time:.1f} seconds")
-        return {"skus_processed": skus_processed}
+        # Return a generator that streams the JSON file
+        return self._stream_json_file(json_path)
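As the removed loop shows, TcgplayerSkus.json maps one card UUID to a list of SKUs, so each streamed item usually fans out to several rows. A small helper capturing that flattening (a sketch; the field names are the ones used in the removed code above):

```python
from typing import Any, Dict, List

def flatten_sku_item(item: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Expand one streamed {card_uuid: [sku, ...]} entry into flat SKU rows."""
    rows = []
    for card_uuid, sku_list in item["data"].items():
        for sku in sku_list:
            rows.append({
                "sku_id": str(sku.get("skuId")),
                "product_id": str(sku.get("productId")),
                "condition": sku.get("condition"),
                "finish": sku.get("finish"),
                "language": sku.get("language"),
                "printing": sku.get("printing"),
                "card_id": card_uuid,
            })
    return rows
```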
-    async def clear_cache(self) -> None:
+    async def clear_cache(self, db: Session) -> None:
         """Clear all cached data"""
-        for subdir in ["identifiers", "skus"]:
-            dir_path = os.path.join(self.cache_dir, subdir)
-            if os.path.exists(dir_path):
-                for filename in os.listdir(dir_path):
-                    file_path = os.path.join(dir_path, filename)
-                    if os.path.isfile(file_path):
-                        os.unlink(file_path)
-        print("MTGJSON cache cleared")
+        try:
+            # Delete all files in the mtgjson subdirectory
+            files = await self.file_service.list_files(db, file_type=["json", "zip"])
+            for file in files:
+                if file.path.startswith("mtgjson/"):
+                    await self.file_service.delete_file(db, file.id)
+            logger.info("MTGJSON cache cleared")
+        except Exception as e:
+            logger.error(f"Error clearing cache: {e}")
+            raise