god help me

2025-04-19 22:54:07 -04:00
parent 6178fdd15d
commit 34eac3d954
2864 changed files with 318 additions and 3207 deletions

View File

@@ -1,7 +1,7 @@
 import os
 import json
 from datetime import datetime, timedelta
-from typing import Optional, List, Dict, Any, Union, Generator, Callable
+from typing import Optional, List, Dict, Any, Union, Generator, Callable, AsyncGenerator
 from sqlalchemy.orm import Session
 from app.models.tcgplayer_group import TCGPlayerGroup
 from app.models.tcgplayer_product import TCGPlayerProduct
@@ -462,111 +462,37 @@ class DataInitializationService(BaseService):
         identifiers_count = 0
         skus_count = 0
-        # Process identifiers
-        if use_cache:
-            cached_file = await self.file_service.get_file_by_filename(db, "mtgjson_identifiers.json")
-            if cached_file and os.path.exists(cached_file.path):
-                logger.info("MTGJSON identifiers initialized from cache")
-                identifiers_count = await self._process_streamed_data(
-                    db,
-                    self._stream_json_file(cached_file.path),
-                    "mtgjson_identifiers.json",
-                    "mtgjson",
-                    self.sync_mtgjson_identifiers
-                )
-            else:
-                logger.info("Downloading MTGJSON identifiers from API")
-                identifiers_count = await self._process_streamed_data(
-                    db,
-                    await mtgjson_service.get_identifiers(db),
-                    "mtgjson_identifiers.json",
-                    "mtgjson",
-                    self.sync_mtgjson_identifiers
-                )
-        else:
-            logger.info("Downloading MTGJSON identifiers from API")
-            identifiers_count = await self._process_streamed_data(
-                db,
-                await mtgjson_service.get_identifiers(db),
-                "mtgjson_identifiers.json",
-                "mtgjson",
-                self.sync_mtgjson_identifiers
-            )
+        # Get identifiers data
+        identifiers_data = await mtgjson_service.get_identifiers(db, use_cache)
+        if identifiers_data and "data" in identifiers_data:
+            identifiers_count = await self.sync_mtgjson_identifiers(db, list(identifiers_data["data"].values()))
-        # Process SKUs
-        if use_cache:
-            cached_file = await self.file_service.get_file_by_filename(db, "mtgjson_skus.json")
-            if cached_file and os.path.exists(cached_file.path):
-                logger.info("MTGJSON SKUs initialized from cache")
-                skus_count = await self._process_streamed_data(
-                    db,
-                    self._stream_json_file(cached_file.path),
-                    "mtgjson_skus.json",
-                    "mtgjson",
-                    self.sync_mtgjson_skus
-                )
-            else:
-                logger.info("Downloading MTGJSON SKUs from API")
-                skus_count = await self._process_streamed_data(
-                    db,
-                    await mtgjson_service.get_skus(db),
-                    "mtgjson_skus.json",
-                    "mtgjson",
-                    self.sync_mtgjson_skus
-                )
-        else:
-            logger.info("Downloading MTGJSON SKUs from API")
-            skus_count = await self._process_streamed_data(
-                db,
-                await mtgjson_service.get_skus(db),
-                "mtgjson_skus.json",
-                "mtgjson",
-                self.sync_mtgjson_skus
-            )
+        # Get SKUs data
+        skus_data = await mtgjson_service.get_skus(db, use_cache)
+        if skus_data and "data" in skus_data:
+            skus_count = await self.sync_mtgjson_skus(db, list(skus_data["data"].values()))
         return {
             "identifiers_processed": identifiers_count,
             "skus_processed": skus_count
         }
-    async def _process_streamed_data(
-        self,
-        db: Session,
-        data_stream: Generator[Dict[str, Any], None, None],
-        filename: str,
-        subdir: str,
-        sync_func: Callable
-    ) -> int:
-        """Process streamed data and sync to database"""
-        count = 0
-        items = []
-        batch_size = 1000
-        for item in data_stream:
-            if item["type"] == "meta":
-                # Handle meta data separately
-                continue
-            count += 1
-            items.append(item["data"])
-            # Process in batches
-            if len(items) >= batch_size:
-                await sync_func(db, items)
-                items = []
-        # Process any remaining items
-        if items:
-            await sync_func(db, items)
-        return count
-    async def sync_mtgjson_identifiers(self, db: Session, identifiers_data: dict):
+    async def sync_mtgjson_identifiers(self, db: Session, identifiers_data: List[dict]) -> int:
         """Sync MTGJSON identifiers data to the database"""
         from app.models.mtgjson_card import MTGJSONCard
         count = 0
         with transaction(db):
-            for card_id, card_data in identifiers_data.items():
+            for card_data in identifiers_data:
                 if not isinstance(card_data, dict):
                     logger.debug(f"Skipping non-dict item: {card_data}")
                     continue
+                card_id = card_data.get("uuid")
+                if not card_id:
+                    logger.debug(f"Skipping item without UUID: {card_data}")
+                    continue
                 existing_card = db.query(MTGJSONCard).filter(MTGJSONCard.card_id == card_id).first()
                 if existing_card:
                     # Update existing card
@@ -636,53 +562,47 @@ class DataInitializationService(BaseService):
                         tnt_id=card_data.get("identifiers", {}).get("tntId")
                     )
                     db.add(new_card)
                 count += 1
-    async def sync_mtgjson_skus(self, db: Session, skus_data: dict):
+        return count
+
+    async def sync_mtgjson_skus(self, db: Session, skus_data: List[List[dict]]) -> int:
         """Sync MTGJSON SKUs data to the database"""
         from app.models.mtgjson_sku import MTGJSONSKU
         count = 0
         with transaction(db):
-            for card_uuid, sku_list in skus_data.items():
-                for sku in sku_list:
-                    # Handle case where sku is a string (skuId)
-                    if isinstance(sku, str):
-                        sku_id = sku
-                        existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == sku_id).first()
-                        if existing_sku:
-                            # Update existing SKU
-                            existing_sku.card_id = card_uuid
-                        else:
-                            new_sku = MTGJSONSKU(
-                                sku_id=sku_id,
-                                card_id=card_uuid
-                            )
-                            db.add(new_sku)
-                    # Handle case where sku is a dictionary
+            for product_data in skus_data:
+                for sku_data in product_data:
+                    sku_id = sku_data.get("skuId")
+                    if not sku_id:
+                        logger.debug(f"Skipping item without SKU ID: {sku_data}")
+                        continue
+                    existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == str(sku_id)).first()
+                    if existing_sku:
+                        # Update existing SKU
+                        for key, value in {
+                            "product_id": sku_data.get("productId"),
+                            "condition": sku_data.get("condition"),
+                            "finish": sku_data.get("finish"),
+                            "language": sku_data.get("language"),
+                            "printing": sku_data.get("printing"),
+                        }.items():
+                            setattr(existing_sku, key, value)
                     else:
-                        sku_id = str(sku.get("skuId"))
-                        existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == sku_id).first()
-                        if existing_sku:
-                            # Update existing SKU
-                            for key, value in {
-                                "product_id": str(sku.get("productId")),
-                                "condition": sku.get("condition"),
-                                "finish": sku.get("finish"),
-                                "language": sku.get("language"),
-                                "printing": sku.get("printing"),
-                                "card_id": card_uuid
-                            }.items():
-                                setattr(existing_sku, key, value)
-                        else:
-                            new_sku = MTGJSONSKU(
-                                sku_id=sku_id,
-                                product_id=str(sku.get("productId")),
-                                condition=sku.get("condition"),
-                                finish=sku.get("finish"),
-                                language=sku.get("language"),
-                                printing=sku.get("printing"),
-                                card_id=card_uuid
-                            )
-                            db.add(new_sku)
+                        new_sku = MTGJSONSKU(
+                            sku_id=sku_id,
+                            product_id=sku_data.get("productId"),
+                            condition=sku_data.get("condition"),
+                            finish=sku_data.get("finish"),
+                            language=sku_data.get("language"),
+                            printing=sku_data.get("printing"),
+                        )
+                        db.add(new_sku)
+                    count += 1
+        return count
+
     async def initialize_data(
         self,

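For context on the new call shape: both MTGJSON payloads are a top-level object with "meta" and "data" keys (visible in the old streaming parser and in the new `"data" in ...` checks), and the refactor hands the sync functions list(payload["data"].values()) instead of a streamed generator. A minimal sketch of the shapes involved, with field names taken from the diff and placeholder values that are not real MTGJSON records:

    # AllIdentifiers: "data" maps card UUID -> card dict, so values() gives a
    # List[dict]; sync_mtgjson_identifiers now reads the "uuid" key itself.
    identifiers_payload = {
        "meta": {"date": "...", "version": "..."},
        "data": {"example-uuid-1": {"uuid": "example-uuid-1"}},
    }
    identifiers = list(identifiers_payload["data"].values())  # List[dict]

    # TcgplayerSkus: "data" maps card UUID -> list of SKU dicts, so values()
    # gives a List[List[dict]], matching the new sync_mtgjson_skus signature.
    # The UUID key is discarded here, which is consistent with the new code no
    # longer setting card_id on MTGJSONSKU rows.
    skus_payload = {
        "data": {"example-uuid-1": [{"skuId": 1, "productId": 2, "condition": "NEAR MINT"}]},
    }
    sku_batches = list(skus_payload["data"].values())  # List[List[dict]]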
View File

@@ -2,7 +2,8 @@ import os
 import json
 import zipfile
 import time
-from typing import Dict, Any, Optional, Generator
+import shutil
+from typing import Dict, Any, Optional
 from sqlalchemy.orm import Session
 from app.services.external_api.base_external_service import BaseExternalService
 from app.schemas.file import FileInDB
@@ -11,32 +12,10 @@ import logging
 logger = logging.getLogger(__name__)

 class MTGJSONService(BaseExternalService):
-    def __init__(self, cache_dir: str = "app/data/cache/mtgjson"):
+    def __init__(self):
         super().__init__(base_url="https://mtgjson.com/api/v5/")
-        # Ensure the cache directory exists
-        os.makedirs(cache_dir, exist_ok=True)
-        self.cache_dir = cache_dir
-        self.identifiers_dir = os.path.join(cache_dir, "identifiers")
-        self.skus_dir = os.path.join(cache_dir, "skus")
-        # Ensure subdirectories exist
-        os.makedirs(self.identifiers_dir, exist_ok=True)
-        os.makedirs(self.skus_dir, exist_ok=True)
-
-    def _format_progress(self, current: int, total: int, start_time: float) -> str:
-        """Format a progress message with percentage and timing information"""
-        elapsed = time.time() - start_time
-        if total > 0:
-            percent = (current / total) * 100
-            items_per_second = current / elapsed if elapsed > 0 else 0
-            eta = (total - current) / items_per_second if items_per_second > 0 else 0
-            return f"[{current}/{total} ({percent:.1f}%)] {items_per_second:.1f} items/sec, ETA: {eta:.1f}s"
-        return f"[{current} items] {current/elapsed:.1f} items/sec"
-
-    def _print_progress(self, message: str, end: str = "\n") -> None:
-        """Print progress message with flush"""
-        print(message, end=end, flush=True)
-
-    async def _download_file(self, db: Session, url: str, filename: str, subdir: str) -> FileInDB:
+    async def _download_and_unzip_file(self, db: Session, url: str, filename: str, subdir: str) -> FileInDB:
         """Download a file from the given URL and save it using FileService"""
         print(f"Downloading {url}...")
         start_time = time.time()
@@ -49,7 +28,7 @@ class MTGJSONService(BaseExternalService):
         )
         # Save the file using the file service
-        return await self.file_service.save_file(
+        file_record = await self.file_service.save_file(
             db=db,
             file_data=file_data,
             filename=filename,
@@ -57,18 +36,24 @@ class MTGJSONService(BaseExternalService):
             file_type="application/zip",
             content_type="application/zip"
         )
+
+        # Unzip the file
+        await self._unzip_file(file_record, subdir, db)
+
+        return file_record

-    async def _unzip_file(self, file_record: FileInDB, subdir: str, db: Session) -> str:
+    async def _unzip_file(self, file_record: FileInDB, subdir: str, db: Session) -> FileInDB:
         """Unzip a file to the specified subdirectory and return the path to the extracted JSON file"""
         try:
-            # Use the appropriate subdirectory based on the type
-            extract_path = self.identifiers_dir if subdir == "identifiers" else self.skus_dir
-            os.makedirs(extract_path, exist_ok=True)
+            file_service = self.get_service('file')
+            cache_dir = file_service.base_cache_dir
+            temp_dir = os.path.join(cache_dir,'mtgjson', subdir, 'temp')
+            os.makedirs(temp_dir, exist_ok=True)
             with zipfile.ZipFile(file_record.path, 'r') as zip_ref:
                 json_filename = zip_ref.namelist()[0]
-                zip_ref.extractall(extract_path)
-                json_path = os.path.join(extract_path, json_filename)
+                zip_ref.extractall(temp_dir)
+                json_path = os.path.join(temp_dir, json_filename)
                 # Create a file record for the extracted JSON file
                 with open(json_path, 'r') as f:
@@ -82,127 +67,57 @@ class MTGJSONService(BaseExternalService):
                     content_type="application/json"
                 )
-                return str(json_file_record.path)
+
+                # remove the temp directory
+                shutil.rmtree(temp_dir)
+
+                return json_file_record
         except Exception as e:
             logger.error(f"Error unzipping file: {e}")
             raise

-    def _stream_json_file(self, file_path: str) -> Generator[Dict[str, Any], None, None]:
-        """Stream a JSON file and yield items one at a time using a streaming parser"""
-        logger.info(f"Starting to stream JSON file: {file_path}")
-        try:
-            with open(file_path, 'r') as f:
-                # First, we need to find the start of the data section
-                data_started = False
-                current_key = None
-                current_value = []
-                brace_count = 0
-                for line in f:
-                    line = line.strip()
-                    if not line:
-                        continue
-                    if not data_started:
-                        if '"data":' in line:
-                            data_started = True
-                            # Skip the opening brace of the data object
-                            line = line[line.find('"data":') + 7:].strip()
-                            if line.startswith('{'):
-                                line = line[1:].strip()
-                        else:
-                            # Yield meta data if found
-                            if '"meta":' in line:
-                                meta_start = line.find('"meta":') + 7
-                                meta_end = line.rfind('}')
-                                if meta_end > meta_start:
-                                    meta_json = line[meta_start:meta_end + 1]
-                                    try:
-                                        meta_data = json.loads(meta_json)
-                                        yield {"type": "meta", "data": meta_data}
-                                    except json.JSONDecodeError as e:
-                                        logger.warning(f"Failed to parse meta data: {e}")
-                            continue
-                    # Process the data section
-                    if data_started:
-                        if not current_key:
-                            # Look for a new key
-                            if '"' in line:
-                                key_start = line.find('"') + 1
-                                key_end = line.find('"', key_start)
-                                if key_end > key_start:
-                                    current_key = line[key_start:key_end]
-                                    # Get the rest of the line after the key
-                                    line = line[key_end + 1:].strip()
-                                    if ':' in line:
-                                        line = line[line.find(':') + 1:].strip()
-                        if current_key:
-                            # Accumulate the value
-                            current_value.append(line)
-                            brace_count += line.count('{') - line.count('}')
-                            if brace_count == 0 and line.endswith(','):
-                                # We have a complete value
-                                value_str = ''.join(current_value).rstrip(',')
-                                try:
-                                    value = json.loads(value_str)
-                                    yield {"type": "item", "data": {current_key: value}}
-                                except json.JSONDecodeError as e:
-                                    logger.warning(f"Failed to parse value for key {current_key}: {e}")
-                                current_key = None
-                                current_value = []
-        except Exception as e:
-            logger.error(f"Error streaming JSON file: {e}")
-            raise

-    async def get_identifiers(self, db: Session) -> Generator[Dict[str, Any], None, None]:
+    async def get_identifiers(self, db: Session, use_cache: bool = True) -> Dict[str, Any]:
         """Download and get MTGJSON identifiers data"""
         # Check if we have a cached version
         cached_file = await self.file_service.get_file_by_filename(db, "AllIdentifiers.json")
-        if cached_file:
-            # Ensure the file exists at the path
-            if os.path.exists(cached_file.path):
-                return self._stream_json_file(cached_file.path)
-        # Download and process the file
-        file_record = await self._download_file(
-            db=db,
-            url="https://mtgjson.com/api/v5/AllIdentifiers.json.zip",
-            filename="AllIdentifiers.json.zip",
-            subdir="identifiers"
-        )
-        # Unzip and process the file
-        json_path = await self._unzip_file(file_record, "identifiers", db)
-        # Return a generator that streams the JSON file
-        return self._stream_json_file(json_path)
+        if cached_file and os.path.exists(cached_file.path) and use_cache:
+            with open(cached_file.path, 'r') as f:
+                logger.debug(f"Loaded identifiers from cache: {cached_file.path}")
+                return json.load(f)
+        else:
+            # Download and process the file
+            logger.debug(f"Downloading identifiers from MTGJSON")
+            file_record = await self._download_and_unzip_file(
+                db=db,
+                url="https://mtgjson.com/api/v5/AllIdentifiers.json.zip",
+                filename="AllIdentifiers.json.zip",
+                subdir="identifiers"
+            )

-    async def get_skus(self, db: Session) -> Generator[Dict[str, Any], None, None]:
+            with open(file_record.path, 'r') as f:
+                logger.debug(f"Loaded identifiers from MTGJSON: {file_record.path}")
+                return json.load(f)
+
+    async def get_skus(self, db: Session, use_cache: bool = True) -> Dict[str, Any]:
         """Download and get MTGJSON SKUs data"""
         # Check if we have a cached version
         cached_file = await self.file_service.get_file_by_filename(db, "TcgplayerSkus.json")
-        if cached_file:
-            # Ensure the file exists at the path
-            if os.path.exists(cached_file.path):
-                return self._stream_json_file(cached_file.path)
-        # Download and process the file
-        file_record = await self._download_file(
-            db=db,
-            url="https://mtgjson.com/api/v5/TcgplayerSkus.json.zip",
-            filename="TcgplayerSkus.json.zip",
-            subdir="skus"
-        )
-        # Unzip and process the file
-        json_path = await self._unzip_file(file_record, "skus", db)
-        # Return a generator that streams the JSON file
-        return self._stream_json_file(json_path)
+        if cached_file and os.path.exists(cached_file.path) and use_cache:
+            with open(cached_file.path, 'r') as f:
+                logger.debug(f"Loaded SKUs from cache: {cached_file.path}")
+                return json.load(f)
+        else:
+            # Download and process the file
+            logger.debug(f"Downloading SKUs from MTGJSON")
+            file_record = await self._download_and_unzip_file(
+                db=db,
+                url="https://mtgjson.com/api/v5/TcgplayerSkus.json.zip",
+                filename="TcgplayerSkus.json.zip",
+                subdir="skus"
+            )
+
+            with open(file_record.path, 'r') as f:
+                logger.debug(f"Loaded SKUs from MTGJSON: {file_record.path}")
+                return json.load(f)

     async def clear_cache(self, db: Session) -> None:
         """Clear all cached data"""

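The service now loads whole files instead of streaming them: a cache hit reads the previously saved JSON, a miss downloads the zip, extracts it into a temp directory, saves the extracted JSON through FileService, and removes the temp directory. A minimal usage sketch of the reworked getters, assuming an async caller with a SQLAlchemy Session in hand (only MTGJSONService and the method signatures come from the diff; the wiring is assumed):

    # Both getters return the parsed JSON as one in-memory dict, so the entire
    # AllIdentifiers / TcgplayerSkus file is held in memory at once.
    service = MTGJSONService()
    identifiers = await service.get_identifiers(db, use_cache=True)   # dict with "meta"/"data"
    skus = await service.get_skus(db, use_cache=False)                # use_cache=False forces a re-download
    cards_by_uuid = identifiers["data"]                               # keyed by card UUID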
View File

@@ -153,7 +153,8 @@ class FileService:
     async def get_file_by_filename(self, db: Session, filename: str) -> Optional[FileInDB]:
         """Get a file record from the database by filename"""
-        file_record = db.query(File).filter(File.name == filename).first()
+        # get most recent file by filename
+        file_record = db.query(File).filter(File.name == filename).order_by(File.created_at.desc()).first()
         if file_record:
             return FileInDB.model_validate(file_record)
         return None

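A note on this change: when several rows share one filename (presumably because each re-download saves a fresh file record), a bare .first() without an ORDER BY returns whichever row the database happens to yield; ordering by created_at descending makes the lookup deterministic, newest row wins. A small sketch of the two query shapes (model and column names as in the diff):

    # ambiguous: row order is unspecified without ORDER BY
    some_row = db.query(File).filter(File.name == filename).first()
    # deterministic: most recently created record wins
    latest = (
        db.query(File)
        .filter(File.name == filename)
        .order_by(File.created_at.desc())
        .first()
    )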
View File

@@ -0,0 +1,15 @@
+from app.services.base_service import BaseService
+from sqlalchemy.orm import Session
+from app.schemas.file import FileInDB
+from typing import Dict, Any
+import csv
+
+
+class ManaboxService(BaseService):
+    def __init__(self):
+        super().__init__(None)
+
+    async def process_manabox_csv(self, db: Session, csv_file: FileInDB) -> bool:
+        return True
+        # Name,Set code,Set name,Collector number,Foil,Rarity,Quantity,ManaBox ID,Scryfall ID,Purchase price,Misprint,Altered,Condition,Language,Purchase price currency
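process_manabox_csv is still a stub; the trailing comment records the ManaBox export header. A hypothetical sketch of what the parsing step could look like with csv.DictReader, using only column names from that comment (the foil and quantity conventions are guesses, and none of this is implemented by the commit):

    import csv

    def read_manabox_rows(path: str) -> list:
        """Parse a ManaBox export CSV into plain dicts (illustrative only)."""
        rows = []
        with open(path, newline='', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                rows.append({
                    "name": row["Name"],
                    "set_code": row["Set code"],
                    "collector_number": row["Collector number"],
                    "foil": row["Foil"].strip().lower() == "foil",  # guess: column holds "foil"/"normal"
                    "quantity": int(row["Quantity"] or 0),
                    "scryfall_id": row["Scryfall ID"],
                    "condition": row["Condition"],
                    "language": row["Language"],
                })
        return rows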