data init idk other stuff

2025-04-18 15:19:57 -04:00
parent 8f35cedb4a
commit 03b43ce3ab
28 changed files with 3378 additions and 810 deletions


@@ -1,171 +1,171 @@
import os
import json
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Union, Generator, Callable
from sqlalchemy.orm import Session
from app.services.external_api.tcgcsv.tcgcsv_service import TCGCSVService
from app.services.external_api.mtgjson.mtgjson_service import MTGJSONService
from app.models.tcgplayer_group import TCGPlayerGroup
from app.models.tcgplayer_product import TCGPlayerProduct
from app.models.tcgplayer_category import TCGPlayerCategory
from app.services.base_service import BaseService
from app.schemas.file import FileInDB
from app.db.database import transaction
import logging
from app.models.tcgplayer_price_history import TCGPlayerPriceHistory
from sqlalchemy import and_, bindparam, update, insert
import py7zr
import shutil
class DataInitializationService:
def __init__(self, cache_dir: str = "app/data/cache/tcgcsv"):
self.cache_dir = cache_dir
self.categories_dir = os.path.join(cache_dir, "categories")
self.groups_dir = os.path.join(cache_dir, "groups")
self.products_dir = os.path.join(cache_dir, "products")
self.tcgcsv_service = TCGCSVService()
self.mtgjson_service = MTGJSONService()
# Create all necessary directories
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(self.categories_dir, exist_ok=True)
os.makedirs(self.groups_dir, exist_ok=True)
os.makedirs(self.products_dir, exist_ok=True)
logger = logging.getLogger(__name__)
def _get_cache_path(self, filename: str, subdir: str) -> str:
"""Get the full path for a cached file in the specified subdirectory"""
return os.path.join(self.cache_dir, subdir, filename)
async def _cache_categories(self, categories_data: dict):
"""Cache categories data to a JSON file"""
cache_path = self._get_cache_path("categories.json", "categories")
with open(cache_path, 'w') as f:
json.dump(categories_data, f, indent=2)
class DataInitializationService(BaseService):
def __init__(self):
super().__init__(None)
async def _cache_groups(self, game_ids: List[int], groups_data: dict):
for game_id in game_ids:
cache_path = self._get_cache_path(f"groups_{game_id}.json", "groups")
with open(cache_path, 'w') as f:
json.dump(groups_data, f, default=str)
async def _cache_products(self, game_ids: List[int], group_id: int, products_data: list):
for game_id in game_ids:
cache_path = self._get_cache_path(f"products_{game_id}_{group_id}.json", "products")
with open(cache_path, 'w') as f:
json.dump(products_data, f, default=str)
async def _load_cached_categories(self) -> Optional[dict]:
cache_path = self._get_cache_path("categories.json", "categories")
if os.path.exists(cache_path):
with open(cache_path, 'r') as f:
return json.load(f)
return None
async def _load_cached_groups(self, game_ids: List[int]) -> Optional[dict]:
# Try to load cached data for any of the game IDs
for game_id in game_ids:
cache_path = self._get_cache_path(f"groups_{game_id}.json", "groups")
if os.path.exists(cache_path):
with open(cache_path, 'r') as f:
return json.load(f)
return None
async def _load_cached_products(self, game_ids: List[int], group_id: int) -> Optional[list]:
# Try to load cached data for any of the game IDs
for game_id in game_ids:
cache_path = self._get_cache_path(f"products_{game_id}_{group_id}.json", "products")
if os.path.exists(cache_path):
with open(cache_path, 'r') as f:
return json.load(f)
return None
async def initialize_data(
async def _cache_data(
self,
db: Session,
game_ids: List[int],
use_cache: bool = True,
init_categories: bool = True,
init_groups: bool = True,
init_products: bool = True,
init_archived_prices: bool = False,
archived_prices_start_date: Optional[str] = None,
archived_prices_end_date: Optional[str] = None,
init_mtgjson: bool = True
) -> Dict[str, Any]:
"""Initialize TCGPlayer data with configurable steps"""
print("Initializing TCGPlayer data...")
results = {
"categories": 0,
"groups": {},
"products": {},
"archived_prices": False,
"mtgjson": {}
}
data: Union[dict, list],
filename: str,
subdir: str,
default_str: bool = False,
file_type: str = "json",
content_type: str = "application/json",
metadata: Optional[Dict] = None
) -> FileInDB:
"""Generic function to cache data to a JSON file"""
file_data = json.dumps(data, default=str if default_str else None, indent=2)
return await self.file_service.save_file(
db,
file_data,
filename,
subdir,
file_type=file_type,
content_type=content_type,
metadata=metadata
)
if init_categories:
print("\nInitializing categories...")
categories_data = None
if use_cache:
categories_data = await self._load_cached_categories()
async def _load_cached_data(
self,
db: Session,
filename: str
) -> Optional[Dict[str, Any]]:
"""Generic function to load cached data from a JSON file with 7-day expiration"""
file_record = await self.file_service.get_file_by_filename(db, filename)
if file_record:
# Check if cache is expired (7 days)
cache_age = datetime.now() - file_record.created_at
if cache_age.days < 7:
with open(file_record.path, 'r') as f:
return json.load(f)
else:
logger.info(f"Cache expired for {filename}, age: {cache_age.days} days")
# Delete the expired cache file
await self.file_service.delete_file(db, file_record.id)
return None
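# Illustrative cache-or-fetch pattern built on the two helpers above (a sketch,
# not part of this commit; "example.json", the "tcgcsv/example" subdir, and
# fetch_from_api() are hypothetical placeholders):
#
#   data = await self._load_cached_data(db, "example.json")
#   if data is None:
#       data = await fetch_from_api()
#       await self._cache_data(db, data, "example.json", "tcgcsv/example")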
async def sync_categories(self, db: Session, categories_data: dict):
"""Sync categories data to the database using streaming for large datasets"""
categories = categories_data.get("results", [])
batch_size = 1000 # Process in batches of 1000
total_categories = len(categories)
with transaction(db):
for i in range(0, total_categories, batch_size):
batch = categories[i:i + batch_size]
for category_data in batch:
existing_category = db.query(TCGPlayerCategory).filter(TCGPlayerCategory.category_id == category_data["categoryId"]).first()
if existing_category:
# Update existing category
for key, value in {
"name": category_data["name"],
"display_name": category_data.get("displayName"),
"seo_category_name": category_data.get("seoCategoryName"),
"category_description": category_data.get("categoryDescription"),
"category_page_title": category_data.get("categoryPageTitle"),
"sealed_label": category_data.get("sealedLabel"),
"non_sealed_label": category_data.get("nonSealedLabel"),
"condition_guide_url": category_data.get("conditionGuideUrl"),
"is_scannable": category_data.get("isScannable", False),
"popularity": category_data.get("popularity", 0),
"is_direct": category_data.get("isDirect", False),
"modified_on": datetime.fromisoformat(category_data["modifiedOn"].replace("Z", "+00:00")) if category_data.get("modifiedOn") else None
}.items():
setattr(existing_category, key, value)
else:
new_category = TCGPlayerCategory(
category_id=category_data["categoryId"],
name=category_data["name"],
display_name=category_data.get("displayName"),
seo_category_name=category_data.get("seoCategoryName"),
category_description=category_data.get("categoryDescription"),
category_page_title=category_data.get("categoryPageTitle"),
sealed_label=category_data.get("sealedLabel"),
non_sealed_label=category_data.get("nonSealedLabel"),
condition_guide_url=category_data.get("conditionGuideUrl"),
is_scannable=category_data.get("isScannable", False),
popularity=category_data.get("popularity", 0),
is_direct=category_data.get("isDirect", False),
modified_on=datetime.fromisoformat(category_data["modifiedOn"].replace("Z", "+00:00")) if category_data.get("modifiedOn") else None
)
db.add(new_category)
# Commit after each batch
db.commit()
logger.info(f"Processed {min(i + batch_size, total_categories)}/{total_categories} categories")
if not categories_data:
print("Fetching categories from API...")
categories_data = await self.tcgcsv_service.get_categories()
if use_cache:
await self._cache_categories(categories_data)
if not categories_data.get("success"):
raise Exception(f"Failed to fetch categories: {categories_data.get('errors')}")
# Sync categories to database
categories = categories_data.get("results", [])
synced_categories = []
for category_data in categories:
existing_category = db.query(TCGPlayerCategory).filter(TCGPlayerCategory.category_id == category_data["categoryId"]).first()
if existing_category:
synced_categories.append(existing_category)
else:
new_category = TCGPlayerCategory(
category_id=category_data["categoryId"],
name=category_data["name"],
display_name=category_data.get("displayName"),
seo_category_name=category_data.get("seoCategoryName"),
category_description=category_data.get("categoryDescription"),
category_page_title=category_data.get("categoryPageTitle"),
sealed_label=category_data.get("sealedLabel"),
non_sealed_label=category_data.get("nonSealedLabel"),
condition_guide_url=category_data.get("conditionGuideUrl"),
is_scannable=category_data.get("isScannable", False),
popularity=category_data.get("popularity", 0),
is_direct=category_data.get("isDirect", False),
modified_on=datetime.fromisoformat(category_data["modifiedOn"].replace("Z", "+00:00")) if category_data.get("modifiedOn") else None
)
db.add(new_category)
synced_categories.append(new_category)
db.commit()
results["categories"] = len(synced_categories)
print(f"Synced {len(synced_categories)} categories")
# Process each game ID separately
for game_id in game_ids:
print(f"\nProcessing game ID: {game_id}")
results["groups"][game_id] = 0
results["products"][game_id] = {}
if init_groups:
print(f"Initializing groups for game ID {game_id}...")
groups_data = None
if use_cache:
groups_data = await self._load_cached_groups([game_id])
if not groups_data:
print(f"Fetching groups for game ID {game_id} from API...")
groups_data = await self.tcgcsv_service.get_groups([game_id])
if use_cache:
await self._cache_groups([game_id], groups_data)
if not groups_data.get("success"):
raise Exception(f"Failed to fetch groups for game ID {game_id}: {groups_data.get('errors')}")
# Sync groups to database
groups = groups_data.get("results", [])
synced_groups = []
for group_data in groups:
async def init_categories(self, db: Session, use_cache: bool = True) -> bool:
"""Initialize categories data"""
logger.info("Starting categories initialization")
if use_cache:
categories_data = await self._load_cached_data(db, "categories.json")
if categories_data:
await self.sync_categories(db, categories_data)
logger.info("Categories initialized from cache")
return True
else:
logger.warning("No cached categories data found")
return False
else:
tcgcsv_service = self.get_service('tcgcsv')
categories_data = await tcgcsv_service.get_categories()
# Save the categories data
await self._cache_data(
db,
categories_data,
"categories.json",
"tcgcsv/categories",
file_type="json",
content_type="application/json"
)
await self.sync_categories(db, categories_data)
logger.info("Categories initialized from API")
return True
async def sync_groups(self, db: Session, groups_data: dict):
"""Sync groups data to the database using streaming for large datasets"""
groups = groups_data.get("results", [])
batch_size = 1000 # Process in batches of 1000
total_groups = len(groups)
with transaction(db):
for i in range(0, total_groups, batch_size):
batch = groups[i:i + batch_size]
for group_data in batch:
existing_group = db.query(TCGPlayerGroup).filter(TCGPlayerGroup.group_id == group_data["groupId"]).first()
if existing_group:
synced_groups.append(existing_group)
# Update existing group
for key, value in {
"name": group_data["name"],
"abbreviation": group_data.get("abbreviation"),
"is_supplemental": group_data.get("isSupplemental", False),
"published_on": datetime.fromisoformat(group_data["publishedOn"].replace("Z", "+00:00")) if group_data.get("publishedOn") else None,
"modified_on": datetime.fromisoformat(group_data["modifiedOn"].replace("Z", "+00:00")) if group_data.get("modifiedOn") else None,
"category_id": group_data.get("categoryId")
}.items():
setattr(existing_group, key, value)
else:
new_group = TCGPlayerGroup(
group_id=group_data["groupId"],
@@ -177,88 +177,561 @@ class DataInitializationService:
category_id=group_data.get("categoryId")
)
db.add(new_group)
synced_groups.append(new_group)
# Commit after each batch
db.commit()
results["groups"][game_id] = len(synced_groups)
print(f"Synced {len(synced_groups)} groups for game ID {game_id}")
logger.info(f"Processed {min(i + batch_size, total_groups)}/{total_groups} groups")
if init_products:
# Handle products for each group in this game ID
for group in synced_groups:
print(f"Initializing products for group {group.name} (game ID {game_id})...")
products_data = None
if use_cache:
products_data = await self._load_cached_products([game_id], group.group_id)
async def init_groups(self, db: Session, use_cache: bool = True, game_ids: Optional[List[int]] = None) -> bool:
"""Initialize groups data"""
logger.info(f"Starting groups initialization for game IDs: {game_ids}")
tcgcsv_service = self.get_service('tcgcsv')
for game_id in game_ids:
if use_cache:
groups_data = await self._load_cached_data(db, f"groups_{game_id}.json")
if groups_data:
await self.sync_groups(db, groups_data)
logger.info(f"Groups initialized from cache for game ID {game_id}")
else:
logger.warning(f"No cached groups data found for game ID {game_id}")
return False
else:
groups_data = await tcgcsv_service.get_groups(game_id)
# Save the groups data
await self._cache_data(
db,
groups_data,
f"groups_{game_id}.json",
"tcgcsv/groups",
file_type="json",
content_type="application/json"
)
await self.sync_groups(db, groups_data)
logger.info(f"Groups initialized from API for game ID {game_id}")
return True
async def sync_products(self, db: Session, products_data: str):
"""Sync products data to the database using streaming for large datasets"""
import csv
import io
# Parse CSV data
csv_reader = csv.DictReader(io.StringIO(products_data))
products_list = list(csv_reader)
batch_size = 1000 # Process in batches of 1000
total_products = len(products_list)
with transaction(db):
for i in range(0, total_products, batch_size):
batch = products_list[i:i + batch_size]
for product_data in batch:
existing_product = db.query(TCGPlayerProduct).filter(TCGPlayerProduct.product_id == product_data["productId"]).first()
if existing_product:
# Update existing product
for key, value in {
"name": product_data["name"],
"clean_name": product_data.get("cleanName"),
"image_url": product_data.get("imageUrl"),
"category_id": product_data.get("categoryId"),
"group_id": product_data.get("groupId"),
"url": product_data.get("url"),
"modified_on": datetime.fromisoformat(product_data["modifiedOn"].replace("Z", "+00:00")) if product_data.get("modifiedOn") else None,
"image_count": product_data.get("imageCount", 0),
"ext_rarity": product_data.get("extRarity"),
"ext_number": product_data.get("extNumber"),
"low_price": float(product_data.get("lowPrice")) if product_data.get("lowPrice") else None,
"mid_price": float(product_data.get("midPrice")) if product_data.get("midPrice") else None,
"high_price": float(product_data.get("highPrice")) if product_data.get("highPrice") else None,
"market_price": float(product_data.get("marketPrice")) if product_data.get("marketPrice") else None,
"direct_low_price": float(product_data.get("directLowPrice")) if product_data.get("directLowPrice") else None,
"sub_type_name": product_data.get("subTypeName")
}.items():
setattr(existing_product, key, value)
else:
new_product = TCGPlayerProduct(
product_id=product_data["productId"],
name=product_data["name"],
clean_name=product_data.get("cleanName"),
image_url=product_data.get("imageUrl"),
category_id=product_data.get("categoryId"),
group_id=product_data.get("groupId"),
url=product_data.get("url"),
modified_on=datetime.fromisoformat(product_data["modifiedOn"].replace("Z", "+00:00")) if product_data.get("modifiedOn") else None,
image_count=product_data.get("imageCount", 0),
ext_rarity=product_data.get("extRarity"),
ext_subtype=product_data.get("extSubtype"),
ext_oracle_text=product_data.get("extOracleText"),
ext_number=product_data.get("extNumber"),
low_price=float(product_data.get("lowPrice")) if product_data.get("lowPrice") else None,
mid_price=float(product_data.get("midPrice")) if product_data.get("midPrice") else None,
high_price=float(product_data.get("highPrice")) if product_data.get("highPrice") else None,
market_price=float(product_data.get("marketPrice")) if product_data.get("marketPrice") else None,
direct_low_price=float(product_data.get("directLowPrice")) if product_data.get("directLowPrice") else None,
sub_type_name=product_data.get("subTypeName"),
ext_power=product_data.get("extPower"),
ext_toughness=product_data.get("extToughness"),
ext_flavor_text=product_data.get("extFlavorText")
)
db.add(new_product)
# Commit after each batch
db.commit()
logger.info(f"Processed {min(i + batch_size, total_products)}/{total_products} products")
if not products_data:
print(f"Fetching products for group {group.name} (game ID {game_id}) from API...")
products_data = await self.tcgcsv_service.get_products_and_prices([game_id], group.group_id)
if use_cache:
await self._cache_products([game_id], group.group_id, products_data)
async def init_products(self, db: Session, use_cache: bool = True, game_ids: Optional[List[int]] = None) -> bool:
"""Initialize products data"""
logger.info(f"Starting products initialization for game IDs: {game_ids}")
tcgcsv_service = self.get_service('tcgcsv')
for game_id in game_ids:
groups = db.query(TCGPlayerGroup).filter(TCGPlayerGroup.category_id == game_id).all()
logger.info(f"Processing {len(groups)} groups for game ID {game_id}")
for group in groups:
if use_cache:
products_data = await self._load_cached_data(db, f"products_{game_id}_{group.group_id}.json")
if products_data:
await self.sync_products(db, products_data)
logger.info(f"Products initialized from cache for group {group.group_id}")
else:
logger.warning(f"No cached products data found for group {group.group_id}")
continue
else:
# Get CSV data from API
csv_data = await tcgcsv_service.get_products_and_prices(game_id, group.group_id)
# Save the CSV file
await self.file_service.save_file(
db,
csv_data,
f"products_{game_id}_{group.group_id}.csv",
"tcgcsv/products",
file_type="csv",
content_type="text/csv"
)
# Parse and sync the CSV data
await self.sync_products(db, csv_data)
logger.info(f"Products initialized from API for group {group.group_id}")
return True
# Sync products to database
synced_products = []
for product_data in products_data:
existing_product = db.query(TCGPlayerProduct).filter(TCGPlayerProduct.product_id == int(product_data["productId"])).first()
if existing_product:
synced_products.append(existing_product)
else:
new_product = TCGPlayerProduct(
product_id=int(product_data["productId"]),
name=product_data["name"],
clean_name=product_data.get("cleanName"),
image_url=product_data.get("imageUrl"),
category_id=int(product_data["categoryId"]),
group_id=int(product_data["groupId"]),
url=product_data.get("url"),
modified_on=datetime.fromisoformat(product_data["modifiedOn"].replace("Z", "+00:00")) if product_data.get("modifiedOn") else None,
image_count=int(product_data.get("imageCount", 0)),
ext_rarity=product_data.get("extRarity"),
ext_number=product_data.get("extNumber"),
low_price=float(product_data.get("lowPrice")) if product_data.get("lowPrice") else None,
mid_price=float(product_data.get("midPrice")) if product_data.get("midPrice") else None,
high_price=float(product_data.get("highPrice")) if product_data.get("highPrice") else None,
market_price=float(product_data.get("marketPrice")) if product_data.get("marketPrice") else None,
direct_low_price=float(product_data.get("directLowPrice")) if product_data.get("directLowPrice") else None,
sub_type_name=product_data.get("subTypeName")
)
db.add(new_product)
synced_products.append(new_product)
db.commit()
results["products"][game_id][group.group_id] = len(synced_products)
print(f"Synced {len(synced_products)} products for group {group.name} (game ID {game_id})")
if init_archived_prices:
if not archived_prices_start_date or not archived_prices_end_date:
raise ValueError("Both start_date and end_date are required for archived prices initialization")
print(f"\nInitializing archived prices from {archived_prices_start_date} to {archived_prices_end_date}...")
await self.tcgcsv_service.get_archived_prices_for_date_range(archived_prices_start_date, archived_prices_end_date)
results["archived_prices"] = True
print("Archived prices initialization completed")
if init_mtgjson:
print("\nInitializing MTGJSON data...")
identifiers_result = await self.mtgjson_service.download_and_process_identifiers(db)
skus_result = await self.mtgjson_service.download_and_process_skus(db)
results["mtgjson"] = {
"cards_processed": identifiers_result["cards_processed"],
"skus_processed": skus_result["skus_processed"]
async def sync_archived_prices(self, db: Session, archived_prices_data: dict, date: datetime):
"""Sync archived prices data to the database using bulk operations.
Note: Historical prices are never updated, only new records are inserted."""
from sqlalchemy import insert
from app.models.tcgplayer_price_history import TCGPlayerPriceHistory
# Prepare data for bulk operations
price_records = []
for price_data in archived_prices_data.get("results", []):
record = {
"product_id": price_data["productId"],
"date": date,
"sub_type_name": price_data["subTypeName"],
"low_price": price_data.get("lowPrice"),
"mid_price": price_data.get("midPrice"),
"high_price": price_data.get("highPrice"),
"market_price": price_data.get("marketPrice"),
"direct_low_price": price_data.get("directLowPrice")
}
price_records.append(record)
if not price_records:
return
# Get existing records in bulk to avoid duplicates
product_ids = [r["product_id"] for r in price_records]
sub_type_names = [r["sub_type_name"] for r in price_records]
existing_records = db.query(TCGPlayerPriceHistory).filter(
TCGPlayerPriceHistory.product_id.in_(product_ids),
TCGPlayerPriceHistory.date == date,
TCGPlayerPriceHistory.sub_type_name.in_(sub_type_names)
).all()
# Filter out existing records
existing_keys = {(r.product_id, r.date, r.sub_type_name) for r in existing_records}
to_insert = [
record for record in price_records
if (record["product_id"], record["date"], record["sub_type_name"]) not in existing_keys
]
# Perform bulk insert for new records only
if to_insert:
stmt = insert(TCGPlayerPriceHistory)
db.execute(stmt, to_insert)
db.commit()
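# Expected input shape for sync_archived_prices (a sketch; values are illustrative only):
#
#   archived_prices_data = {
#       "success": True,
#       "results": [
#           {"productId": 12345, "subTypeName": "Normal",
#            "lowPrice": 0.50, "midPrice": 1.00, "highPrice": 2.00,
#            "marketPrice": 0.90, "directLowPrice": None},
#       ],
#   }
#   await self.sync_archived_prices(db, archived_prices_data, datetime(2024, 1, 1))
#
# Rows are de-duplicated on (product_id, date, sub_type_name); existing history rows are never modified.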
async def init_archived_prices(self, db: Session, start_date: datetime, end_date: datetime, use_cache: bool = True, game_ids: Optional[List[int]] = None) -> bool:
"""Initialize archived prices data"""
logger.info(f"Starting archived prices initialization from {start_date} to {end_date}")
tcgcsv_service = self.get_service('tcgcsv')
processed_dates = await tcgcsv_service.get_tcgcsv_date_range(start_date, end_date)
logger.info(f"Processing {len(processed_dates)} dates")
# Convert game_ids to set for faster lookups
desired_game_ids = set(game_ids) if game_ids else set()
for date in processed_dates:
date_path = f"app/data/cache/tcgcsv/prices/{date}"
# Check if we already have the data for this date
if use_cache and os.path.exists(date_path):
logger.info(f"Using cached price data for {date}")
else:
logger.info(f"Downloading and processing archived prices for {date}")
# Download and extract the archive
archive_data = await tcgcsv_service.get_archived_prices_for_date(date)
# Save the archive file
file_record = await self.file_service.save_file(
db,
archive_data,
f"prices-{date}.ppmd.7z",
"tcgcsv/prices/zip",
file_type="application/x-7z-compressed",
content_type="application/x-7z-compressed"
)
# Extract the 7z file to a temporary directory
temp_extract_path = f"app/data/cache/tcgcsv/prices/temp_{date}"
os.makedirs(temp_extract_path, exist_ok=True)
with py7zr.SevenZipFile(file_record.path, 'r') as archive:
archive.extractall(path=temp_extract_path)
# Find the date subdirectory in the temp directory
date_subdir = os.path.join(temp_extract_path, str(date))
if os.path.exists(date_subdir):
# Remove existing directory if it exists
if os.path.exists(date_path):
shutil.rmtree(date_path)
# Create the destination directory
os.makedirs(date_path, exist_ok=True)
# Move contents from the date subdirectory to the final path
for item in os.listdir(date_subdir):
src = os.path.join(date_subdir, item)
dst = os.path.join(date_path, item)
os.rename(src, dst)
# Clean up the temporary directory
os.rmdir(date_subdir)
os.rmdir(temp_extract_path)
# Process each category directory
for category_id in os.listdir(date_path):
# Skip categories that aren't in our desired game IDs
if int(category_id) not in desired_game_ids:
continue
category_path = os.path.join(date_path, category_id)
if not os.path.isdir(category_path):
continue
# Process each group directory
for group_id in os.listdir(category_path):
group_path = os.path.join(category_path, group_id)
if not os.path.isdir(group_path):
continue
# Process the prices file
prices_file = os.path.join(group_path, "prices")
if not os.path.exists(prices_file):
continue
try:
with open(prices_file, 'r') as f:
price_data = json.load(f)
if price_data.get("success"):
await self.sync_archived_prices(db, price_data, datetime.strptime(date, "%Y-%m-%d"))
logger.info(f"Processed prices for category {category_id}, group {group_id} on {date}")
except Exception as e:
logger.error(f"Error processing prices file {prices_file}: {str(e)}")
continue
return True
async def init_mtgjson(self, db: Session, use_cache: bool = True) -> Dict[str, Any]:
"""Initialize MTGJSON data"""
logger.info("Starting MTGJSON initialization")
mtgjson_service = self.get_service('mtgjson')
identifiers_count = 0
skus_count = 0
# Process identifiers
if use_cache:
cached_file = await self.file_service.get_file_by_filename(db, "mtgjson_identifiers.json")
if cached_file and os.path.exists(cached_file.path):
logger.info("MTGJSON identifiers initialized from cache")
identifiers_count = await self._process_streamed_data(
db,
self._stream_json_file(cached_file.path),
"mtgjson_identifiers.json",
"mtgjson",
self.sync_mtgjson_identifiers
)
else:
logger.info("Downloading MTGJSON identifiers from API")
identifiers_count = await self._process_streamed_data(
db,
await mtgjson_service.get_identifiers(db),
"mtgjson_identifiers.json",
"mtgjson",
self.sync_mtgjson_identifiers
)
else:
logger.info("Downloading MTGJSON identifiers from API")
identifiers_count = await self._process_streamed_data(
db,
await mtgjson_service.get_identifiers(db),
"mtgjson_identifiers.json",
"mtgjson",
self.sync_mtgjson_identifiers
)
# Process SKUs
if use_cache:
cached_file = await self.file_service.get_file_by_filename(db, "mtgjson_skus.json")
if cached_file and os.path.exists(cached_file.path):
logger.info("MTGJSON SKUs initialized from cache")
skus_count = await self._process_streamed_data(
db,
self._stream_json_file(cached_file.path),
"mtgjson_skus.json",
"mtgjson",
self.sync_mtgjson_skus
)
else:
logger.info("Downloading MTGJSON SKUs from API")
skus_count = await self._process_streamed_data(
db,
await mtgjson_service.get_skus(db),
"mtgjson_skus.json",
"mtgjson",
self.sync_mtgjson_skus
)
else:
logger.info("Downloading MTGJSON SKUs from API")
skus_count = await self._process_streamed_data(
db,
await mtgjson_service.get_skus(db),
"mtgjson_skus.json",
"mtgjson",
self.sync_mtgjson_skus
)
return {
"identifiers_processed": identifiers_count,
"skus_processed": skus_count
}
async def _process_streamed_data(
self,
db: Session,
data_stream: Generator[Dict[str, Any], None, None],
filename: str,
subdir: str,
sync_func: Callable
) -> int:
"""Process streamed data and sync to database"""
count = 0
items = []
batch_size = 1000
for item in data_stream:
if item["type"] == "meta":
# Handle meta data separately
continue
count += 1
items.append(item["data"])
# Process in batches
if len(items) >= batch_size:
await sync_func(db, items)
items = []
# Process any remaining items
if items:
await sync_func(db, items)
return count
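# The stream consumed by _process_streamed_data is assumed to yield dicts of the
# form {"type": ..., "data": ...}: items tagged "meta" are skipped, and every
# other item has its "data" payload collected and flushed to sync_func in
# batches of 1000. A minimal hypothetical generator for local testing:
#
#   def example_stream():
#       yield {"type": "meta", "data": {"date": "2024-01-01"}}
#       yield {"type": "item", "data": {"uuid": "...", "name": "..."}}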
async def sync_mtgjson_identifiers(self, db: Session, identifiers_data: dict):
"""Sync MTGJSON identifiers data to the database"""
from app.models.mtgjson_card import MTGJSONCard
with transaction(db):
for card_id, card_data in identifiers_data.items():
existing_card = db.query(MTGJSONCard).filter(MTGJSONCard.card_id == card_id).first()
if existing_card:
# Update existing card
for key, value in {
"name": card_data.get("name"),
"set_code": card_data.get("setCode"),
"uuid": card_data.get("uuid"),
"abu_id": card_data.get("identifiers", {}).get("abuId"),
"card_kingdom_etched_id": card_data.get("identifiers", {}).get("cardKingdomEtchedId"),
"card_kingdom_foil_id": card_data.get("identifiers", {}).get("cardKingdomFoilId"),
"card_kingdom_id": card_data.get("identifiers", {}).get("cardKingdomId"),
"cardsphere_id": card_data.get("identifiers", {}).get("cardsphereId"),
"cardsphere_foil_id": card_data.get("identifiers", {}).get("cardsphereFoilId"),
"cardtrader_id": card_data.get("identifiers", {}).get("cardtraderId"),
"csi_id": card_data.get("identifiers", {}).get("csiId"),
"mcm_id": card_data.get("identifiers", {}).get("mcmId"),
"mcm_meta_id": card_data.get("identifiers", {}).get("mcmMetaId"),
"miniaturemarket_id": card_data.get("identifiers", {}).get("miniaturemarketId"),
"mtg_arena_id": card_data.get("identifiers", {}).get("mtgArenaId"),
"mtgjson_foil_version_id": card_data.get("identifiers", {}).get("mtgjsonFoilVersionId"),
"mtgjson_non_foil_version_id": card_data.get("identifiers", {}).get("mtgjsonNonFoilVersionId"),
"mtgjson_v4_id": card_data.get("identifiers", {}).get("mtgjsonV4Id"),
"mtgo_foil_id": card_data.get("identifiers", {}).get("mtgoFoilId"),
"mtgo_id": card_data.get("identifiers", {}).get("mtgoId"),
"multiverse_id": card_data.get("identifiers", {}).get("multiverseId"),
"scg_id": card_data.get("identifiers", {}).get("scgId"),
"scryfall_id": card_data.get("identifiers", {}).get("scryfallId"),
"scryfall_card_back_id": card_data.get("identifiers", {}).get("scryfallCardBackId"),
"scryfall_oracle_id": card_data.get("identifiers", {}).get("scryfallOracleId"),
"scryfall_illustration_id": card_data.get("identifiers", {}).get("scryfallIllustrationId"),
"tcgplayer_product_id": card_data.get("identifiers", {}).get("tcgplayerProductId"),
"tcgplayer_etched_product_id": card_data.get("identifiers", {}).get("tcgplayerEtchedProductId"),
"tnt_id": card_data.get("identifiers", {}).get("tntId")
}.items():
setattr(existing_card, key, value)
else:
new_card = MTGJSONCard(
card_id=card_id,
name=card_data.get("name"),
set_code=card_data.get("setCode"),
uuid=card_data.get("uuid"),
abu_id=card_data.get("identifiers", {}).get("abuId"),
card_kingdom_etched_id=card_data.get("identifiers", {}).get("cardKingdomEtchedId"),
card_kingdom_foil_id=card_data.get("identifiers", {}).get("cardKingdomFoilId"),
card_kingdom_id=card_data.get("identifiers", {}).get("cardKingdomId"),
cardsphere_id=card_data.get("identifiers", {}).get("cardsphereId"),
cardsphere_foil_id=card_data.get("identifiers", {}).get("cardsphereFoilId"),
cardtrader_id=card_data.get("identifiers", {}).get("cardtraderId"),
csi_id=card_data.get("identifiers", {}).get("csiId"),
mcm_id=card_data.get("identifiers", {}).get("mcmId"),
mcm_meta_id=card_data.get("identifiers", {}).get("mcmMetaId"),
miniaturemarket_id=card_data.get("identifiers", {}).get("miniaturemarketId"),
mtg_arena_id=card_data.get("identifiers", {}).get("mtgArenaId"),
mtgjson_foil_version_id=card_data.get("identifiers", {}).get("mtgjsonFoilVersionId"),
mtgjson_non_foil_version_id=card_data.get("identifiers", {}).get("mtgjsonNonFoilVersionId"),
mtgjson_v4_id=card_data.get("identifiers", {}).get("mtgjsonV4Id"),
mtgo_foil_id=card_data.get("identifiers", {}).get("mtgoFoilId"),
mtgo_id=card_data.get("identifiers", {}).get("mtgoId"),
multiverse_id=card_data.get("identifiers", {}).get("multiverseId"),
scg_id=card_data.get("identifiers", {}).get("scgId"),
scryfall_id=card_data.get("identifiers", {}).get("scryfallId"),
scryfall_card_back_id=card_data.get("identifiers", {}).get("scryfallCardBackId"),
scryfall_oracle_id=card_data.get("identifiers", {}).get("scryfallOracleId"),
scryfall_illustration_id=card_data.get("identifiers", {}).get("scryfallIllustrationId"),
tcgplayer_product_id=card_data.get("identifiers", {}).get("tcgplayerProductId"),
tcgplayer_etched_product_id=card_data.get("identifiers", {}).get("tcgplayerEtchedProductId"),
tnt_id=card_data.get("identifiers", {}).get("tntId")
)
db.add(new_card)
async def sync_mtgjson_skus(self, db: Session, skus_data: dict):
"""Sync MTGJSON SKUs data to the database"""
from app.models.mtgjson_sku import MTGJSONSKU
with transaction(db):
for card_uuid, sku_list in skus_data.items():
for sku in sku_list:
# Handle case where sku is a string (skuId)
if isinstance(sku, str):
sku_id = sku
existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == sku_id).first()
if existing_sku:
# Update existing SKU
existing_sku.card_id = card_uuid
else:
new_sku = MTGJSONSKU(
sku_id=sku_id,
card_id=card_uuid
)
db.add(new_sku)
# Handle case where sku is a dictionary
else:
sku_id = str(sku.get("skuId"))
existing_sku = db.query(MTGJSONSKU).filter(MTGJSONSKU.sku_id == sku_id).first()
if existing_sku:
# Update existing SKU
for key, value in {
"product_id": str(sku.get("productId")),
"condition": sku.get("condition"),
"finish": sku.get("finish"),
"language": sku.get("language"),
"printing": sku.get("printing"),
"card_id": card_uuid
}.items():
setattr(existing_sku, key, value)
else:
new_sku = MTGJSONSKU(
sku_id=sku_id,
product_id=str(sku.get("productId")),
condition=sku.get("condition"),
finish=sku.get("finish"),
language=sku.get("language"),
printing=sku.get("printing"),
card_id=card_uuid
)
db.add(new_sku)
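# Expected input shape for sync_mtgjson_skus: a mapping of card UUID to a list of
# SKUs, where each SKU may be either a bare skuId string or a full dict
# (a sketch with illustrative values only):
#
#   skus_data = {
#       "0001adde-...": ["7236441"],
#       "0002b0c8-...": [{"skuId": 7236442, "productId": 12345,
#                         "condition": "NEAR MINT", "finish": "FOIL",
#                         "language": "ENGLISH", "printing": "FOIL"}],
#   }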
async def initialize_data(
self,
db: Session,
game_ids: List[int],
use_cache: bool = False,
init_categories: bool = True,
init_groups: bool = True,
init_products: bool = True,
init_archived_prices: bool = True,
archived_prices_start_date: Optional[str] = None,
archived_prices_end_date: Optional[str] = None,
init_mtgjson: bool = True
) -> Dict[str, Any]:
"""Initialize 3rd party API data loads with configurable steps"""
logger.info("Starting data initialization process")
results = {}
if init_categories:
logger.info("Initializing categories...")
results["categories"] = await self.init_categories(db, use_cache)
if init_groups:
logger.info("Initializing groups...")
results["groups"] = await self.init_groups(db, use_cache, game_ids)
if init_products:
logger.info("Initializing products...")
results["products"] = await self.init_products(db, use_cache, game_ids)
if init_archived_prices:
logger.info("Initializing archived prices...")
results["archived_prices"] = await self.init_archived_prices(
db,
archived_prices_start_date,
archived_prices_end_date,
use_cache,
game_ids
)
if init_mtgjson:
logger.info("Initializing MTGJSON data...")
results["mtgjson"] = await self.init_mtgjson(db, use_cache)
logger.info("Data initialization completed")
return results
async def clear_cache(self) -> None:
async def clear_cache(self, db: Session) -> None:
"""Clear all cached data"""
# Delete all files in categories, groups, and products directories
for subdir in ["categories", "groups", "products"]:
dir_path = os.path.join(self.cache_dir, subdir)
if os.path.exists(dir_path):
for filename in os.listdir(dir_path):
file_path = os.path.join(dir_path, filename)
if os.path.isfile(file_path):
os.unlink(file_path)
files = await self.file_service.list_files(db, file_type="json")
for file in files:
if file.path.startswith(subdir):
await self.file_service.delete_file(db, file.id)
await self.mtgjson_service.clear_cache()
print("Cache cleared")
async def close(self):
await self.tcgcsv_service.close()
print("Cache cleared")