fixy costco stuff

This commit is contained in:
zman 2024-03-05 21:52:36 -05:00
parent 2088a30d75
commit 0a9ae6460b
13 changed files with 315 additions and 64 deletions

View File

@ -93,7 +93,6 @@ class PostManager:
Returns:
dict: The response from the API containing the post data.
"""
self.log_manager.log(f"Getting post by reddit id: {reddit_id}")
response = self.api_request_handler.send_api_request(
"GET", f"{self.api_request_handler.api_url}posts/?reddit_id={reddit_id}"
)
@ -109,7 +108,6 @@ class PostManager:
Returns:
bool: True if the post exists, False otherwise.
"""
self.log_manager.log(f"Checking if post exists: {reddit_id}")
response = self.get_post_by_reddit_id(reddit_id)
if len(response) == 0:
return False
@ -125,7 +123,6 @@ class PostManager:
Returns:
dict: The response from the API after attempting to insert the post data.
"""
self.log_manager.log(f"Inserting post: {post.reddit_id}")
data = {
"reddit_id": post.reddit_id,
"title": post.title,
@ -192,7 +189,6 @@ class PostAnalyticsManager:
Returns:
bool: True if the post meets update requirements, False otherwise.
"""
self.log_manager.log(f"Checking update requirements for {reddit_id}")
# Specify your desired timezone, e.g., UTC
timezone = ZoneInfo("UTC")
@ -207,9 +203,6 @@ class PostAnalyticsManager:
post_id = self.post_manager.get_post_by_reddit_id(reddit_id)
post_id = post_id[0]["id"]
self.log_manager.log(
f"{self.api_request_handler.api_url}post_analytics/?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}"
)
response = self.api_request_handler.send_api_request(
"GET",
@ -234,7 +227,6 @@ class PostAnalyticsManager:
Returns:
dict: The response from the API after updating the post's analytics.
"""
self.log_manager.log(f"Updating post analytics for {post.reddit_id}")
post_id = self.post_manager.get_post_by_reddit_id(post.reddit_id)
post_id = post_id[0]["id"]
data = {
@ -247,3 +239,103 @@ class PostAnalyticsManager:
"POST", f"{self.api_request_handler.api_url}post_analytics/", data=data
)
return response
class CostcoProductManager:
    """
    Manages operations related to Costco products, including retrieval and insertion of product data into a database
    via API requests. Utilizes an instance of ApiRequestHandler for API interactions and LoggingManager for logging
    operations.

    Products may be supplied either as objects exposing the product fields as attributes (e.g. a scraped
    ``Product``) or as dict rows previously returned by the API.

    Attributes:
        api_request_handler (ApiRequestHandler): Handles the API requests for interacting with Costco product data.
        log_manager (LoggingManager): Manages logging for operations performed by CostcoProductManager.
    """

    # Fields sent to the API when inserting or updating a product.
    _PAYLOAD_FIELDS = ("sku", "name", "price", "img_url", "product_link", "active")

    def __init__(self, api_request_handler: "ApiRequestHandler"):
        """
        Initializes the CostcoProductManager with an API request handler for making API calls and a logging manager
        for logging.

        Parameters:
            api_request_handler (ApiRequestHandler): The handler for making API requests.
        """
        self.api_request_handler = api_request_handler
        self.log_manager = LoggingManager("scraper.log")

    @staticmethod
    def _field(product, name):
        """Reads one field from a product given either as a dict row or as an attribute-bearing object."""
        if isinstance(product, dict):
            return product[name]
        return getattr(product, name)

    def _build_payload(self, product) -> dict:
        """Builds the request body shared by the insert and update calls."""
        return {name: self._field(product, name) for name in self._PAYLOAD_FIELDS}

    def get_all_costco_products(self) -> list:
        """
        Retrieves all Costco products from the database through an API call.

        Returns:
            list: The response from the API containing all Costco products.
        """
        self.log_manager.log("Getting all Costco products")
        return self.api_request_handler.send_api_request(
            "GET", f"{self.api_request_handler.api_url}costco_products/"
        )

    def insert_costco_product(self, product) -> dict:
        """
        Inserts a new Costco product into the database through an API call.

        Parameters:
            product (CostcoProduct | dict): The product data to insert.

        Returns:
            dict: The response from the API after attempting to insert the product data.
        """
        self.log_manager.log(f"Inserting Costco product: {self._field(product, 'sku')}")
        return self.api_request_handler.send_api_request(
            "POST",
            f"{self.api_request_handler.api_url}costco_products/",
            data=self._build_payload(product),
        )

    def update_costco_product(self, product) -> dict:
        """
        Updates an existing Costco product in the database through an API call.

        Parameters:
            product (CostcoProduct | dict): The updated product data. Dict rows returned by the API are
                accepted as-is.

        Returns:
            dict: The response from the API after attempting to update the product data.
        """
        sku = self._field(product, "sku")
        self.log_manager.log(f"Updating Costco product: {sku}")

        # The detail route is addressed by the record's primary key unless the
        # view overrides its lookup field, so prefer "id" when the product
        # carries one and fall back to the SKU otherwise.
        if isinstance(product, dict):
            record_id = product.get("id")
        else:
            record_id = getattr(product, "id", None)
        identifier = sku if record_id is None else record_id

        return self.api_request_handler.send_api_request(
            "PUT",
            f"{self.api_request_handler.api_url}costco_products/{identifier}/",
            data=self._build_payload(product),
        )

    def get_costco_product_by_sku(self, sku: str) -> dict:
        """
        Retrieves a Costco product by its SKU from the database through an API call.

        Parameters:
            sku (str): The SKU of the product to retrieve.

        Returns:
            dict: The response from the API containing the product data.
            NOTE(review): this queries the filtered list endpoint, so the payload may be a list of
            matches rather than a single dict - confirm against the API.
        """
        from urllib.parse import quote

        self.log_manager.log(f"Getting Costco product by SKU: {sku}")
        # Percent-encode the SKU so reserved characters cannot corrupt the query string.
        encoded_sku = quote(str(sku), safe="")
        return self.api_request_handler.send_api_request(
            "GET", f"{self.api_request_handler.api_url}costco_products/?sku={encoded_sku}"
        )

View File

@ -18,8 +18,7 @@ class Application:
submission_manager (SubmissionManager): Manages the processing of Reddit submissions.
log_manager (LoggingManager): Centralized logging for the application.
scheduler: Manages the scheduling of periodic updates.
thread_manager: Manages threading for asynchronous operations.
update_frequency (int): The frequency, in seconds, at which post analytics should be updated.
costco_manager (CostcoProductManager): Manages Costco product data.
"""
def __init__(
@ -30,6 +29,7 @@ class Application:
post_manager,
post_analytics_manager,
submission_manager,
costco_manager,
):
"""
Initializes the application with all necessary components.
@ -41,19 +41,19 @@ class Application:
post_manager (PostManager): The manager for post operations.
post_analytics_manager (PostAnalyticsManager): The manager for post analytics operations.
submission_manager (SubmissionManager): The manager for processing Reddit submissions.
update_frequency (int): The frequency, in seconds, at which to perform updates.
"""
self.reddit_monitor = reddit_monitor
self.webhook_notifier = webhook_notifier
self.api_conn = api_conn
self.post_manager = post_manager
self.post_analytics_manager = post_analytics_manager
self.costco_manager = costco_manager
self.log_manager = LoggingManager("scraper.log")
self.submission_manager = submission_manager
self.scheduler = Scheduler()
# how often should post analytics be updated (call for update and database update are separate)
self.update_analytics_frequency = 60 * 15
self.scrape_costco_frequency = 60 * 60
self.update_analytics_frequency = 60 * 15 # every 15 minutes
self.scrape_costco_frequency = 60 * 60 * 4 # every 4 hours
def update_analytics(self):
"""
@ -62,19 +62,74 @@ class Application:
self.log_manager.info("Running periodic analytics update")
to_be_updated = self.post_manager.get_posts_from_last_7_days()
submissions = self.reddit_monitor.update_submissions(to_be_updated)
self.submission_manager.process_submissions(submissions, self.update_analytics_frequency)
self.submission_manager.process_submissions(
submissions, self.update_analytics_frequency
)
def scrape_costco(self):
"""
Executes periodic updates for costco products based on the predefined frequency.
Executes periodic updates for Costco products based on the predefined frequency.
"""
self.log_manager.info("Running periodic costco scrape")
costco_monitor = CostcoMonitor("https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon")
products = costco_monitor.get_products()
self.log_manager.info("Running periodic Costco scrape")
costco_monitor = CostcoMonitor(
"https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon"
)
fetched_products = costco_monitor.get_products()
costco_monitor.close()
self.log_manager.info(f"Found {len(products)} products on the page")
self.log_manager.info(products)
self.webhook_notifier.costco_notification(products)
# Fetch existing products from the database, assuming it returns a list directly
existing_products = self.costco_manager.get_all_costco_products()
# Containers for updates
products_to_update = []
products_to_insert = []
# Mapping existing products for quick lookup
existing_products_map = {
product["sku"]: product for product in existing_products
}
for product in fetched_products:
existing_product = existing_products_map.get(product.sku)
if existing_product:
self.log_manager.log(f"Found existing product: {product.sku}")
needs_update = False
# Compare and decide if an update is necessary (for price change, activation/deactivation)
if existing_product["price"] != product.price:
existing_product["price"] = product.price
needs_update = True
if existing_product["active"] != product.active:
existing_product["active"] = product.active
needs_update = True
if needs_update:
products_to_update.append(existing_product)
else:
self.log_manager.log(f"Adding new product: {product.sku}")
products_to_insert.append(product)
# Update existing products in the database if necessary
for product in products_to_update:
self.costco_manager.update_costco_product(product)
# Insert new products into the database
for product in products_to_insert:
self.costco_manager.insert_costco_product(product)
# Optionally, deactivate products not found in the latest fetch
skus_fetched = {product.sku for product in fetched_products}
products_to_deactivate = [
product
for product in existing_products
if product["sku"] not in skus_fetched and product["active"]
]
for product in products_to_deactivate:
product["active"] = False
self.costco_manager.update_costco_product(product)
# Send notifications for new products
for product in products_to_insert:
self.webhook_notifier.costco_notification(product)
def add_scheduler_task(self, name, task, interval):
"""
@ -95,9 +150,15 @@ class Application:
self.log_manager.info("Application started")
# tasks
self.add_scheduler_task("update_analytics", self.update_analytics, self.update_analytics_frequency)
self.add_scheduler_task("scrape_costco", self.scrape_costco, self.scrape_costco_frequency)
self.add_scheduler_task(
"update_analytics", self.update_analytics, self.update_analytics_frequency
)
self.add_scheduler_task(
"scrape_costco", self.scrape_costco, self.scrape_costco_frequency
)
# Stream submissions and process them
submissions = self.reddit_monitor.stream_submissions()
self.submission_manager.process_submissions(submissions, self.update_analytics_frequency)
self.submission_manager.process_submissions(
submissions, self.update_analytics_frequency
)

View File

@ -1,12 +1,12 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from app_log import LoggingManager
from models import Product
import os
class CostcoMonitor:
@ -19,6 +19,8 @@ class CostcoMonitor:
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
if os.name == "nt":
chrome_options.add_argument("--disable-gpu")
self.driver = webdriver.Chrome(options=chrome_options)
self.log_manager = LoggingManager("scraper.log")
@ -30,17 +32,27 @@ class CostcoMonitor:
except TimeoutException:
self.log_manager.error("Timed out waiting for page to load")
def get_products(self):
def get_products(self, retries=0) -> list[Product]:
self.log_manager.info(f"Loading Costco page: {self.url}")
self.driver.get(self.url)
self.wait_for_page_load() # Wait for the page to fully load
# Wait for the product list to be visible on the page
WebDriverWait(self.driver, 20).until(
EC.visibility_of_element_located((By.CSS_SELECTOR, "div.product-list.grid"))
)
products = self.driver.find_elements(By.CSS_SELECTOR, "div.col-xs-6.col-lg-4.col-xl-3.product")
print("Waiting for product")
try:
WebDriverWait(self.driver, 20).until(
EC.visibility_of_element_located((By.XPATH, "//div[@automation-id='productList']"))
)
except TimeoutException:
self.log_manager.error("Timed out waiting for product list to load")
if retries < 3:
self.log_manager.info("Retrying...")
self.get_products(retries + 1)
else:
self.log_manager.error("Failed to load product list after 3 retries")
return []
products = self.driver.find_elements(By.XPATH, "//div[@automation-id='productList']/div[contains(@class, 'product')]")
self.log_manager.info(f"Found {len(products)} products on the page")
product_detail_list = []
@ -55,13 +67,7 @@ class CostcoMonitor:
img_url = img_element.get_attribute('src') if img_element else "Image URL not found"
product_link_element = product.find_element(By.CSS_SELECTOR, "a.product-image-url")
product_link = product_link_element.get_attribute('href') if product_link_element else "Product link not found"
product_detail_list.append({
"sku": product_sku,
"name": product_name,
"price": price,
"img_url": img_url,
"product_link": product_link
})
product_detail_list.append(Product(product_sku, product_name, price, img_url, product_link))
self.log_manager.log(f"SKU: {product_sku}, Name: {product_name}, Price: {price}, Image URL: {img_url}, Product Link: {product_link}")
except Exception as e:
@ -74,7 +80,7 @@ class CostcoMonitor:
self.log_manager.info("Browser closed")
if __name__ == "__main__":
url = "https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon"
url = "https://www.costco.com/CatalogSearch?dept=All&keyword=bagels"
monitor = CostcoMonitor(url)
monitor.get_products()
monitor.close()

View File

@ -1,6 +1,6 @@
from webhook import WebhookNotifier
from app import Application
from api import ApiRequestHandler, PostManager, PostAnalyticsManager
from api import ApiRequestHandler, PostManager, PostAnalyticsManager, CostcoProductManager
from reddit import RedditMonitor, SubmissionManager
from config import Config
from app_log import LoggingManager
@ -26,6 +26,7 @@ if __name__ == "__main__":
api_conn = ApiRequestHandler(api_url)
post_manager = PostManager(api_conn)
post_analytics_manager = PostAnalyticsManager(api_conn, post_manager)
costco_manager = CostcoProductManager(api_conn)
submission_manager = SubmissionManager(
reddit_monitor, post_manager, post_analytics_manager, webhook_notifier
)
@ -36,6 +37,7 @@ if __name__ == "__main__":
post_manager,
post_analytics_manager,
submission_manager,
costco_manager,
)
app.run()

View File

@ -25,3 +25,16 @@ class Post:
def __str__(self):
return f"{self.reddit_id} {self.title} {self.name} {self.url} {self.score} {self.num_comments} {self.created_utc} {self.selftext} {self.permalink} {self.upvote_ratio}"
class Product:
    """Simple value holder for one product listing."""

    def __init__(self, sku, name, price, img_url, product_link, active=True):
        """
        Stores the given values as same-named attributes.

        Parameters:
            sku: Identifier of the product.
            name: Display name of the product.
            price: Price of the product.
            img_url: URL of the product image.
            product_link: URL of the product page.
            active: Whether the product counts as active; defaults to True.
        """
        self.sku, self.name, self.price = sku, name, price
        self.img_url, self.product_link = img_url, product_link
        self.active = active

    def __str__(self):
        """Returns all six fields, in declaration order, separated by single spaces."""
        parts = (
            self.sku,
            self.name,
            self.price,
            self.img_url,
            self.product_link,
            self.active,
        )
        return " ".join(format(part) for part in parts)

View File

@ -139,14 +139,10 @@ class SubmissionManager:
update_frequency (int, optional): The minimum frequency in seconds to update a post's analytics.
"""
for submission in submissions:
self.log_manager.log(submission)
if self.post_manager.post_exists(submission.id):
self.log_manager.log("Post exists")
self.log_manager.log(f"post id: {submission.id}")
if self.post_analytics_manager.check_update_requirements(
submission.id, update_frequency
):
self.log_manager.log("Update requirements met")
post = self.convert_submission_to_post(submission)
self.post_analytics_manager.update_post_analytics(post)
else:

View File

@ -1,5 +1,6 @@
import requests
from app_log import LoggingManager
from models import Product, Post
class WebhookNotifier:
@ -26,13 +27,11 @@ class WebhookNotifier:
except Exception as e:
self.log_manager.error(f"Failed to send notification: {e}")
def costco_notification(self, data):
for product in data:
sku = product.get("sku")
name = product.get("name")
price = product.get("price")
img_url = product.get("img_url")
product_link = product.get("product_link")
def costco_notification(self, product : Product):
name = product.name
price = product.price
product_link = product.product_link
img_url = product.img_url
content = f"""
**Costco has a new item!**

View File

@ -0,0 +1,27 @@
# Generated by Django 5.0.2 on 2024-03-06 00:59
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema migration that creates the ``CostcoProduct`` model."""

    # Must run after the app's initial migration.
    dependencies = [
        ('pokemans_app', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='CostcoProduct',
            fields=[
                ('id', models.AutoField(primary_key=True, serialize=False)),
                # No uniqueness constraint is declared on sku in this migration.
                ('sku', models.CharField(max_length=255)),
                ('name', models.CharField(max_length=255)),
                # Price is stored as text, not as a numeric type.
                ('price', models.CharField(max_length=255)),
                ('img_url', models.CharField(max_length=555)),
                ('product_link', models.CharField(max_length=555)),
                ('active', models.BooleanField(default=True)),
                # NOTE(review): both timestamps use auto_now, so created_at is
                # refreshed on every save just like updated_at - confirm intended.
                ('created_at', models.DateTimeField(auto_now=True)),
                ('updated_at', models.DateTimeField(auto_now=True)),
            ],
        ),
    ]

View File

@ -0,0 +1,18 @@
# Generated by Django 5.0.2 on 2024-03-06 02:26
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema migration that makes ``CostcoProduct.sku`` unique."""

    # Must run after the migration that created CostcoProduct.
    dependencies = [
        ('pokemans_app', '0002_costcoproduct'),
    ]

    operations = [
        # Same CharField definition as before, now with unique=True.
        migrations.AlterField(
            model_name='costcoproduct',
            name='sku',
            field=models.CharField(max_length=255, unique=True),
        ),
    ]

View File

@ -20,3 +20,15 @@ class PostAnalytics(models.Model):
score = models.IntegerField()
upvote_ratio = models.FloatField()
created_at = models.DateTimeField(auto_now=True)
class CostcoProduct(models.Model):
    """A Costco product listing, identified by its unique SKU."""

    id = models.AutoField(primary_key=True)
    sku = models.CharField(max_length=255, unique=True)
    name = models.CharField(max_length=255)
    # Price is stored as text, not as a numeric type.
    price = models.CharField(max_length=255)
    img_url = models.CharField(max_length=555)
    product_link = models.CharField(max_length=555)
    active = models.BooleanField(default=True)
    # auto_now_add stamps the row once, at creation. The previous auto_now
    # rewrote this value on every save, making it identical to updated_at.
    # NOTE: run makemigrations after this change so the migration state matches.
    created_at = models.DateTimeField(auto_now_add=True)
    # auto_now refreshes the timestamp on every save.
    updated_at = models.DateTimeField(auto_now=True)

View File

@ -1,5 +1,5 @@
from rest_framework import serializers
from .models import Post, PostAnalytics
from .models import Post, PostAnalytics, CostcoProduct
class PostSerializer(serializers.ModelSerializer):
@ -11,3 +11,9 @@ class PostAnalyticsSerializer(serializers.ModelSerializer):
class Meta:
model = PostAnalytics
fields = '__all__'
class CostcoProductSerializer(serializers.ModelSerializer):
    """Model serializer for ``CostcoProduct`` that exposes every model field."""

    class Meta:
        model = CostcoProduct
        # '__all__' includes every field declared on the model.
        fields = '__all__'

View File

@ -1,7 +1,7 @@
from django.shortcuts import render
from rest_framework import viewsets
from .models import Post, PostAnalytics
from .serializers import PostSerializer, PostAnalyticsSerializer
from .models import Post, PostAnalytics, CostcoProduct
from .serializers import PostSerializer, PostAnalyticsSerializer, CostcoProductSerializer
from datetime import timedelta
from django.utils import timezone
from django.utils.dateparse import parse_datetime
@ -55,3 +55,21 @@ class PostAnalyticsViewSet(viewsets.ModelViewSet):
pass
return queryset
class CostcoProductViewSet(viewsets.ModelViewSet):
    """
    CRUD endpoints for ``CostcoProduct`` records.

    The list endpoint supports two optional query parameters:
        sku: return only the product with this exact SKU.
        active: return only active (``true``/``t``/``1``) or inactive
            (``false``/``f``/``0``) products; matching is case-insensitive.
    """

    queryset = CostcoProduct.objects.all()
    serializer_class = CostcoProductSerializer

    # Accepted spellings of the ``active`` query parameter, compared lower-cased.
    _TRUE_VALUES = frozenset({"1", "t", "true"})
    _FALSE_VALUES = frozenset({"0", "f", "false"})

    def get_queryset(self):
        """
        Returns the product queryset narrowed by the ``sku`` and ``active`` query parameters.

        Returns:
            QuerySet: All products, filtered by whichever supported parameters are present.
        """
        queryset = CostcoProduct.objects.all()
        params = self.request.query_params

        sku = params.get('sku', None)
        if sku is not None:
            queryset = queryset.filter(sku=sku)

        active = params.get('active', None)
        if active is not None:
            # Query parameters are strings. Passing e.g. "true" straight into a
            # BooleanField filter raises a validation error, so translate the
            # text to a real bool first.
            flag = active.strip().lower()
            if flag in self._TRUE_VALUES:
                queryset = queryset.filter(active=True)
            elif flag in self._FALSE_VALUES:
                queryset = queryset.filter(active=False)
            # Unrecognised values are ignored: the filter is not applied.

        return queryset

View File

@ -17,12 +17,13 @@ Including another URLconf
from django.contrib import admin
from django.urls import path, include
from rest_framework.routers import DefaultRouter
from pokemans_app.views import PostViewSet, PostAnalyticsViewSet
from pokemans_app.views import PostViewSet, PostAnalyticsViewSet, CostcoProductViewSet
router = DefaultRouter()
router.register(r"posts", PostViewSet)
router.register(r"post_analytics", PostAnalyticsViewSet)
router.register(r"costco_products", CostcoProductViewSet)
urlpatterns = [
path("admin/", admin.site.urls),