From 0a9ae6460bf4a7025aa60737a37670d13659dcac Mon Sep 17 00:00:00 2001 From: zman Date: Tue, 5 Mar 2024 21:52:36 -0500 Subject: [PATCH] fixy costco stuff --- scraper/api.py | 108 ++++++++++++++++-- scraper/app.py | 95 ++++++++++++--- scraper/costco.py | 36 +++--- scraper/main.py | 4 +- scraper/models.py | 13 +++ scraper/reddit.py | 4 - scraper/webhook.py | 27 +++-- .../migrations/0002_costcoproduct.py | 27 +++++ .../0003_alter_costcoproduct_sku.py | 18 +++ server/pokemans_app/models.py | 14 ++- server/pokemans_app/serializers.py | 8 +- server/pokemans_app/views.py | 22 +++- server/pokemans_django/urls.py | 3 +- 13 files changed, 315 insertions(+), 64 deletions(-) create mode 100644 server/pokemans_app/migrations/0002_costcoproduct.py create mode 100644 server/pokemans_app/migrations/0003_alter_costcoproduct_sku.py diff --git a/scraper/api.py b/scraper/api.py index bc3bb0b..13c523c 100644 --- a/scraper/api.py +++ b/scraper/api.py @@ -93,7 +93,6 @@ class PostManager: Returns: dict: The response from the API containing the post data. """ - self.log_manager.log(f"Getting post by reddit id: {reddit_id}") response = self.api_request_handler.send_api_request( "GET", f"{self.api_request_handler.api_url}posts/?reddit_id={reddit_id}" ) @@ -109,7 +108,6 @@ class PostManager: Returns: bool: True if the post exists, False otherwise. """ - self.log_manager.log(f"Checking if post exists: {reddit_id}") response = self.get_post_by_reddit_id(reddit_id) if len(response) == 0: return False @@ -125,7 +123,6 @@ class PostManager: Returns: dict: The response from the API after attempting to insert the post data. """ - self.log_manager.log(f"Inserting post: {post.reddit_id}") data = { "reddit_id": post.reddit_id, "title": post.title, @@ -192,7 +189,6 @@ class PostAnalyticsManager: Returns: bool: True if the post meets update requirements, False otherwise. """ - self.log_manager.log(f"Checking update requirements for {reddit_id}") # Specify your desired timezone, e.g., UTC timezone = ZoneInfo("UTC") @@ -207,9 +203,6 @@ class PostAnalyticsManager: post_id = self.post_manager.get_post_by_reddit_id(reddit_id) post_id = post_id[0]["id"] - self.log_manager.log( - f"{self.api_request_handler.api_url}post_analytics/?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}" - ) response = self.api_request_handler.send_api_request( "GET", @@ -234,7 +227,6 @@ class PostAnalyticsManager: Returns: dict: The response from the API after updating the post's analytics. """ - self.log_manager.log(f"Updating post analytics for {post.reddit_id}") post_id = self.post_manager.get_post_by_reddit_id(post.reddit_id) post_id = post_id[0]["id"] data = { @@ -247,3 +239,103 @@ class PostAnalyticsManager: "POST", f"{self.api_request_handler.api_url}post_analytics/", data=data ) return response + + +class CostcoProductManager: + """ + Manages operations related to Costco products, including retrieval and insertion of product data into a database + via API requests. Utilizes an instance of ApiRequestHandler for API interactions and LoggingManager for logging + operations. + + Attributes: + api_request_handler (ApiRequestHandler): Handles the API requests for interacting with Costco product data. + log_manager (LoggingManager): Manages logging for operations performed by CostcoProductManager. + """ + + def __init__(self, api_request_handler: ApiRequestHandler): + """ + Initializes the CostcoProductManager with an API request handler for making API calls and a logging manager + for logging. + + Parameters: + api_request_handler (ApiRequestHandler): The handler for making API requests. + """ + self.api_request_handler = api_request_handler + self.log_manager = LoggingManager("scraper.log") + + def get_all_costco_products(self) -> list: + """ + Retrieves all Costco products from the database through an API call. + + Returns: + dict: The response from the API containing all Costco products. + """ + self.log_manager.log("Getting all Costco products") + all_products = self.api_request_handler.send_api_request( + "GET", f"{self.api_request_handler.api_url}costco_products/" + ) + return all_products + + def insert_costco_product(self, product) -> dict: + """ + Inserts a new Costco product into the database through an API call. + + Parameters: + product (CostcoProduct): The CostcoProduct object containing the data to insert. + + Returns: + dict: The response from the API after attempting to insert the product data. + """ + self.log_manager.log(f"Inserting Costco product: {product.sku}") + data = { + "sku": product.sku, + "name": product.name, + "price": product.price, + "img_url": product.img_url, + "product_link": product.product_link, + "active": product.active, + } + response = self.api_request_handler.send_api_request( + "POST", f"{self.api_request_handler.api_url}costco_products/", data=data + ) + return response + + def update_costco_product(self, product) -> dict: + """ + Updates an existing Costco product in the database through an API call. + + Parameters: + product (CostcoProduct): The CostcoProduct object containing the updated data. + + Returns: + dict: The response from the API after attempting to update the product data. + """ + self.log_manager.log(f"Updating Costco product: {product.sku}") + data = { + "sku": product.sku, + "name": product.name, + "price": product.price, + "img_url": product.img_url, + "product_link": product.product_link, + "active": product.active, + } + response = self.api_request_handler.send_api_request( + "PUT", f"{self.api_request_handler.api_url}costco_products/{product.sku}/", data=data + ) + return response + + def get_costco_product_by_sku(self, sku: str) -> dict: + """ + Retrieves a Costco product by its SKU from the database through an API call. + + Parameters: + sku (str): The SKU of the product to retrieve. + + Returns: + dict: The response from the API containing the product data. + """ + self.log_manager.log(f"Getting Costco product by SKU: {sku}") + response = self.api_request_handler.send_api_request( + "GET", f"{self.api_request_handler.api_url}costco_products/?sku={sku}" + ) + return response \ No newline at end of file diff --git a/scraper/app.py b/scraper/app.py index 45f0d8f..f59caaf 100644 --- a/scraper/app.py +++ b/scraper/app.py @@ -18,8 +18,7 @@ class Application: submission_manager (SubmissionManager): Manages the processing of Reddit submissions. log_manager (LoggingManager): Centralized logging for the application. scheduler: Manages the scheduling of periodic updates. - thread_manager: Manages threading for asynchronous operations. - update_frequency (int): The frequency, in seconds, at which post analytics should be updated. + costco_manager (CostcoManager): Manages Costco product data. """ def __init__( @@ -30,6 +29,7 @@ class Application: post_manager, post_analytics_manager, submission_manager, + costco_manager, ): """ Initializes the application with all necessary components. @@ -41,19 +41,19 @@ class Application: post_manager (PostManager): The manager for post operations. post_analytics_manager (PostAnalyticsManager): The manager for post analytics operations. submission_manager (SubmissionManager): The manager for processing Reddit submissions. - update_frequency (int): The frequency, in seconds, at which to perform updates. """ self.reddit_monitor = reddit_monitor self.webhook_notifier = webhook_notifier self.api_conn = api_conn self.post_manager = post_manager self.post_analytics_manager = post_analytics_manager + self.costco_manager = costco_manager self.log_manager = LoggingManager("scraper.log") self.submission_manager = submission_manager self.scheduler = Scheduler() # how often should post analytics be updated (call for update and database update are separate) - self.update_analytics_frequency = 60 * 15 - self.scrape_costco_frequency = 60 * 60 + self.update_analytics_frequency = 60 * 15 # every 15 minutes + self.scrape_costco_frequency = 60 * 60 * 4 # every 4 hours def update_analytics(self): """ @@ -62,19 +62,74 @@ class Application: self.log_manager.info("Running periodic analytics update") to_be_updated = self.post_manager.get_posts_from_last_7_days() submissions = self.reddit_monitor.update_submissions(to_be_updated) - self.submission_manager.process_submissions(submissions, self.update_analytics_frequency) - + self.submission_manager.process_submissions( + submissions, self.update_analytics_frequency + ) + def scrape_costco(self): """ - Executes periodic updates for costco products based on the predefined frequency. + Executes periodic updates for Costco products based on the predefined frequency. """ - self.log_manager.info("Running periodic costco scrape") - costco_monitor = CostcoMonitor("https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon") - products = costco_monitor.get_products() + self.log_manager.info("Running periodic Costco scrape") + costco_monitor = CostcoMonitor( + "https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon" + ) + fetched_products = costco_monitor.get_products() costco_monitor.close() - self.log_manager.info(f"Found {len(products)} products on the page") - self.log_manager.info(products) - self.webhook_notifier.costco_notification(products) + + # Fetch existing products from the database, assuming it returns a list directly + existing_products = self.costco_manager.get_all_costco_products() + + # Containers for updates + products_to_update = [] + products_to_insert = [] + + # Mapping existing products for quick lookup + existing_products_map = { + product["sku"]: product for product in existing_products + } + + for product in fetched_products: + existing_product = existing_products_map.get(product.sku) + + if existing_product: + self.log_manager.log(f"Found existing product: {product.sku}") + needs_update = False + # Compare and decide if an update is necessary (for price change, activation/deactivation) + if existing_product["price"] != product.price: + existing_product["price"] = product.price + needs_update = True + if existing_product["active"] != product.active: + existing_product["active"] = product.active + needs_update = True + if needs_update: + products_to_update.append(existing_product) + else: + self.log_manager.log(f"Adding new product: {product.sku}") + products_to_insert.append(product) + + # Update existing products in the database if necessary + for product in products_to_update: + self.costco_manager.update_costco_product(product) + + # Insert new products into the database + for product in products_to_insert: + self.costco_manager.insert_costco_product(product) + + # Optionally, deactivate products not found in the latest fetch + skus_fetched = {product.sku for product in fetched_products} + products_to_deactivate = [ + product + for product in existing_products + if product["sku"] not in skus_fetched and product["active"] + ] + for product in products_to_deactivate: + product["active"] = False + self.costco_manager.update_costco_product(product) + + # Send notifications for new products + for product in products_to_insert: + self.webhook_notifier.costco_notification(product) def add_scheduler_task(self, name, task, interval): """ @@ -95,9 +150,15 @@ class Application: self.log_manager.info("Application started") # tasks - self.add_scheduler_task("update_analytics", self.update_analytics, self.update_analytics_frequency) - self.add_scheduler_task("scrape_costco", self.scrape_costco, self.scrape_costco_frequency) + self.add_scheduler_task( + "update_analytics", self.update_analytics, self.update_analytics_frequency + ) + self.add_scheduler_task( + "scrape_costco", self.scrape_costco, self.scrape_costco_frequency + ) # Stream submissions and process them submissions = self.reddit_monitor.stream_submissions() - self.submission_manager.process_submissions(submissions, self.update_analytics_frequency) + self.submission_manager.process_submissions( + submissions, self.update_analytics_frequency + ) diff --git a/scraper/costco.py b/scraper/costco.py index cc1c906..4a03267 100644 --- a/scraper/costco.py +++ b/scraper/costco.py @@ -1,12 +1,12 @@ from selenium import webdriver -from selenium.webdriver.chrome.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException -from webdriver_manager.chrome import ChromeDriverManager from app_log import LoggingManager +from models import Product +import os class CostcoMonitor: @@ -19,6 +19,8 @@ class CostcoMonitor: chrome_options.add_argument("--log-level=3") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") + if os.name == "nt": + chrome_options.add_argument("--disable-gpu") self.driver = webdriver.Chrome(options=chrome_options) self.log_manager = LoggingManager("scraper.log") @@ -30,17 +32,27 @@ class CostcoMonitor: except TimeoutException: self.log_manager.error("Timed out waiting for page to load") - def get_products(self): + def get_products(self, retries=0) -> list[Product]: self.log_manager.info(f"Loading Costco page: {self.url}") self.driver.get(self.url) self.wait_for_page_load() # Wait for the page to fully load # Wait for the product list to be visible on the page - WebDriverWait(self.driver, 20).until( - EC.visibility_of_element_located((By.CSS_SELECTOR, "div.product-list.grid")) - ) - products = self.driver.find_elements(By.CSS_SELECTOR, "div.col-xs-6.col-lg-4.col-xl-3.product") + print("Waiting for product") + try: + WebDriverWait(self.driver, 20).until( + EC.visibility_of_element_located((By.XPATH, "//div[@automation-id='productList']")) + ) + except TimeoutException: + self.log_manager.error("Timed out waiting for product list to load") + if retries < 3: + self.log_manager.info("Retrying...") + self.get_products(retries + 1) + else: + self.log_manager.error("Failed to load product list after 3 retries") + return [] + products = self.driver.find_elements(By.XPATH, "//div[@automation-id='productList']/div[contains(@class, 'product')]") self.log_manager.info(f"Found {len(products)} products on the page") product_detail_list = [] @@ -55,13 +67,7 @@ class CostcoMonitor: img_url = img_element.get_attribute('src') if img_element else "Image URL not found" product_link_element = product.find_element(By.CSS_SELECTOR, "a.product-image-url") product_link = product_link_element.get_attribute('href') if product_link_element else "Product link not found" - product_detail_list.append({ - "sku": product_sku, - "name": product_name, - "price": price, - "img_url": img_url, - "product_link": product_link - }) + product_detail_list.append(Product(product_sku, product_name, price, img_url, product_link)) self.log_manager.log(f"SKU: {product_sku}, Name: {product_name}, Price: {price}, Image URL: {img_url}, Product Link: {product_link}") except Exception as e: @@ -74,7 +80,7 @@ class CostcoMonitor: self.log_manager.info("Browser closed") if __name__ == "__main__": - url = "https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon" + url = "https://www.costco.com/CatalogSearch?dept=All&keyword=bagels" monitor = CostcoMonitor(url) monitor.get_products() monitor.close() diff --git a/scraper/main.py b/scraper/main.py index 3da14e1..26c74b4 100644 --- a/scraper/main.py +++ b/scraper/main.py @@ -1,6 +1,6 @@ from webhook import WebhookNotifier from app import Application -from api import ApiRequestHandler, PostManager, PostAnalyticsManager +from api import ApiRequestHandler, PostManager, PostAnalyticsManager, CostcoProductManager from reddit import RedditMonitor, SubmissionManager from config import Config from app_log import LoggingManager @@ -26,6 +26,7 @@ if __name__ == "__main__": api_conn = ApiRequestHandler(api_url) post_manager = PostManager(api_conn) post_analytics_manager = PostAnalyticsManager(api_conn, post_manager) + costco_manager = CostcoProductManager(api_conn) submission_manager = SubmissionManager( reddit_monitor, post_manager, post_analytics_manager, webhook_notifier ) @@ -36,6 +37,7 @@ if __name__ == "__main__": post_manager, post_analytics_manager, submission_manager, + costco_manager, ) app.run() diff --git a/scraper/models.py b/scraper/models.py index 61efb83..f3f47cb 100644 --- a/scraper/models.py +++ b/scraper/models.py @@ -25,3 +25,16 @@ class Post: def __str__(self): return f"{self.reddit_id} {self.title} {self.name} {self.url} {self.score} {self.num_comments} {self.created_utc} {self.selftext} {self.permalink} {self.upvote_ratio}" + + +class Product: + def __init__(self, sku, name, price, img_url, product_link, active=True): + self.sku = sku + self.name = name + self.price = price + self.img_url = img_url + self.product_link = product_link + self.active = active + + def __str__(self): + return f"{self.sku} {self.name} {self.price} {self.img_url} {self.product_link} {self.active}" diff --git a/scraper/reddit.py b/scraper/reddit.py index cabcda4..04f3819 100644 --- a/scraper/reddit.py +++ b/scraper/reddit.py @@ -139,14 +139,10 @@ class SubmissionManager: update_frequency (int, optional): The minimum frequency in seconds to update a post's analytics. """ for submission in submissions: - self.log_manager.log(submission) if self.post_manager.post_exists(submission.id): - self.log_manager.log("Post exists") - self.log_manager.log(f"post id: {submission.id}") if self.post_analytics_manager.check_update_requirements( submission.id, update_frequency ): - self.log_manager.log("Update requirements met") post = self.convert_submission_to_post(submission) self.post_analytics_manager.update_post_analytics(post) else: diff --git a/scraper/webhook.py b/scraper/webhook.py index 025e349..79c944c 100644 --- a/scraper/webhook.py +++ b/scraper/webhook.py @@ -1,5 +1,6 @@ import requests from app_log import LoggingManager +from models import Product, Post class WebhookNotifier: @@ -26,23 +27,21 @@ class WebhookNotifier: except Exception as e: self.log_manager.error(f"Failed to send notification: {e}") - def costco_notification(self, data): - for product in data: - sku = product.get("sku") - name = product.get("name") - price = product.get("price") - img_url = product.get("img_url") - product_link = product.get("product_link") + def costco_notification(self, product : Product): + name = product.name + price = product.price + product_link = product.product_link + img_url = product.img_url - content = f""" + content = f""" **Costco has a new item!** **Name:** {name} **Price:** {price} **Link:** {product_link} {img_url}""" - if not self.disable_webhook: - self.log_manager.log(f"Sending notification to {self.webhook_url}") - try: - requests.post(self.webhook_url, data={"content": content}) - except Exception as e: - self.log_manager.error(f"Failed to send notification: {e}") \ No newline at end of file + if not self.disable_webhook: + self.log_manager.log(f"Sending notification to {self.webhook_url}") + try: + requests.post(self.webhook_url, data={"content": content}) + except Exception as e: + self.log_manager.error(f"Failed to send notification: {e}") \ No newline at end of file diff --git a/server/pokemans_app/migrations/0002_costcoproduct.py b/server/pokemans_app/migrations/0002_costcoproduct.py new file mode 100644 index 0000000..5a9d9a8 --- /dev/null +++ b/server/pokemans_app/migrations/0002_costcoproduct.py @@ -0,0 +1,27 @@ +# Generated by Django 5.0.2 on 2024-03-06 00:59 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('pokemans_app', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='CostcoProduct', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('sku', models.CharField(max_length=255)), + ('name', models.CharField(max_length=255)), + ('price', models.CharField(max_length=255)), + ('img_url', models.CharField(max_length=555)), + ('product_link', models.CharField(max_length=555)), + ('active', models.BooleanField(default=True)), + ('created_at', models.DateTimeField(auto_now=True)), + ('updated_at', models.DateTimeField(auto_now=True)), + ], + ), + ] diff --git a/server/pokemans_app/migrations/0003_alter_costcoproduct_sku.py b/server/pokemans_app/migrations/0003_alter_costcoproduct_sku.py new file mode 100644 index 0000000..4fdf712 --- /dev/null +++ b/server/pokemans_app/migrations/0003_alter_costcoproduct_sku.py @@ -0,0 +1,18 @@ +# Generated by Django 5.0.2 on 2024-03-06 02:26 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('pokemans_app', '0002_costcoproduct'), + ] + + operations = [ + migrations.AlterField( + model_name='costcoproduct', + name='sku', + field=models.CharField(max_length=255, unique=True), + ), + ] diff --git a/server/pokemans_app/models.py b/server/pokemans_app/models.py index a9566e6..23204f7 100644 --- a/server/pokemans_app/models.py +++ b/server/pokemans_app/models.py @@ -19,4 +19,16 @@ class PostAnalytics(models.Model): num_comments = models.IntegerField() score = models.IntegerField() upvote_ratio = models.FloatField() - created_at = models.DateTimeField(auto_now=True) \ No newline at end of file + created_at = models.DateTimeField(auto_now=True) + + +class CostcoProduct(models.Model): + id = models.AutoField(primary_key=True) + sku = models.CharField(max_length=255, unique=True) + name = models.CharField(max_length=255) + price = models.CharField(max_length=255) + img_url = models.CharField(max_length=555) + product_link = models.CharField(max_length=555) + active = models.BooleanField(default=True) + created_at = models.DateTimeField(auto_now=True) + updated_at = models.DateTimeField(auto_now=True) \ No newline at end of file diff --git a/server/pokemans_app/serializers.py b/server/pokemans_app/serializers.py index 63bf48d..110fcbe 100644 --- a/server/pokemans_app/serializers.py +++ b/server/pokemans_app/serializers.py @@ -1,5 +1,5 @@ from rest_framework import serializers -from .models import Post, PostAnalytics +from .models import Post, PostAnalytics, CostcoProduct class PostSerializer(serializers.ModelSerializer): @@ -10,4 +10,10 @@ class PostSerializer(serializers.ModelSerializer): class PostAnalyticsSerializer(serializers.ModelSerializer): class Meta: model = PostAnalytics + fields = '__all__' + + +class CostcoProductSerializer(serializers.ModelSerializer): + class Meta: + model = CostcoProduct fields = '__all__' \ No newline at end of file diff --git a/server/pokemans_app/views.py b/server/pokemans_app/views.py index 2aee3d5..a74e983 100644 --- a/server/pokemans_app/views.py +++ b/server/pokemans_app/views.py @@ -1,7 +1,7 @@ from django.shortcuts import render from rest_framework import viewsets -from .models import Post, PostAnalytics -from .serializers import PostSerializer, PostAnalyticsSerializer +from .models import Post, PostAnalytics, CostcoProduct +from .serializers import PostSerializer, PostAnalyticsSerializer, CostcoProductSerializer from datetime import timedelta from django.utils import timezone from django.utils.dateparse import parse_datetime @@ -54,4 +54,22 @@ class PostAnalyticsViewSet(viewsets.ModelViewSet): # This is where you could log an error or handle the case where datetime strings are invalid pass + return queryset + + +class CostcoProductViewSet(viewsets.ModelViewSet): + queryset = CostcoProduct.objects.all() + serializer_class = CostcoProductSerializer + + def get_queryset(self): + queryset = CostcoProduct.objects.all() + active = self.request.query_params.get('active', None) + sku = self.request.query_params.get('sku', None) + + if sku is not None: + queryset = queryset.filter(sku=sku) + + if active is not None: + queryset = queryset.filter(active=active) + return queryset \ No newline at end of file diff --git a/server/pokemans_django/urls.py b/server/pokemans_django/urls.py index ef93e59..78d0b02 100644 --- a/server/pokemans_django/urls.py +++ b/server/pokemans_django/urls.py @@ -17,12 +17,13 @@ Including another URLconf from django.contrib import admin from django.urls import path, include from rest_framework.routers import DefaultRouter -from pokemans_app.views import PostViewSet, PostAnalyticsViewSet +from pokemans_app.views import PostViewSet, PostAnalyticsViewSet, CostcoProductViewSet router = DefaultRouter() router.register(r"posts", PostViewSet) router.register(r"post_analytics", PostAnalyticsViewSet) +router.register(r"costco_products", CostcoProductViewSet) urlpatterns = [ path("admin/", admin.site.urls),