add costco scraper

This commit is contained in:
zman 2024-03-05 17:03:40 -05:00
parent ffe3157b3c
commit 2088a30d75
12 changed files with 617 additions and 453 deletions

2
.gitignore vendored
View File

@ -2,4 +2,4 @@
__pycache__
.venv
*.sqlite3
*.log
*.log*

View File

@ -3,6 +3,31 @@ FROM python:3.11
# Set environment variables
# (legacy `ENV key value` form; PYTHONDONTWRITEBYTECODE stops .pyc creation,
# PYTHONUNBUFFERED makes stdout/stderr unbuffered so container logs appear live)
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1
# Suppress interactive apt prompts during image build
ENV DEBIAN_FRONTEND noninteractive
# Install dependencies for Chrome
# (adds Google's signing key and apt repo, then refreshes the package index)
RUN apt-get update && apt-get install -y wget gnupg2 ca-certificates unzip \
&& wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
&& apt-get update
# Install Google Chrome
RUN apt-get install -y google-chrome-stable
# Install specific version of ChromeDriver
# NOTE(review): pinned driver version should match the major version of the
# google-chrome-stable package installed above — confirm when bumping either.
ARG CHROMEDRIVER_VERSION=122.0.6261.94
RUN wget -N https://storage.googleapis.com/chrome-for-testing-public/$CHROMEDRIVER_VERSION/linux64/chromedriver-linux64.zip -P ~/ \
&& unzip ~/chromedriver-linux64.zip -d ~/ \
&& rm ~/chromedriver-linux64.zip \
&& mv -f ~/chromedriver-linux64/chromedriver /usr/local/bin/chromedriver \
&& chown root:root /usr/local/bin/chromedriver \
&& chmod 0755 /usr/local/bin/chromedriver
# Set display port to avoid crash
ENV DISPLAY=:99
# Upgrade pip
RUN pip install --upgrade pip
# Set the working directory in the container
WORKDIR /app

0
scraper/__init__.py Normal file
View File

249
scraper/api.py Normal file
View File

@ -0,0 +1,249 @@
import requests
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from exceptions import APIRequestError, InvalidDataTypeError, InvalidMethodError
from app_log import LoggingManager
from models import Post
class ApiRequestHandler:
    """
    Handles API requests for the application. Supports basic HTTP methods: GET, POST, PUT, DELETE.

    Utilizes the `requests` library to send requests to a specified API URL and handles
    response validation and error logging.

    Attributes:
        api_url (str): The base URL for the API to which requests are sent.
        log_manager (LoggingManager): Manages logging for API request operations.
    """

    # HTTP methods accepted by send_api_request.
    SUPPORTED_METHODS = frozenset({"GET", "POST", "PUT", "DELETE"})
    # Status codes treated as a successful response.
    SUCCESS_CODES = frozenset({200, 201, 204})

    def __init__(self, api_url: str):
        self.api_url = api_url
        self.log_manager = LoggingManager("scraper.log")

    def send_api_request(
        self, method: str, api_url: str, data=None, params=None
    ) -> dict:
        """
        Sends a request to the API using the specified HTTP method, URL, and optional data and parameters.

        Parameters:
            method (str): The HTTP method to use for the request. Must be one of: GET, POST, PUT, DELETE.
            api_url (str): The URL endpoint to send the request to.
            data (dict, optional): The payload to send in the request body.
            params (dict, optional): The URL parameters to append to the request.

        Returns:
            dict: The JSON response from the API, or an empty dict for a
            bodyless success response (e.g. 204 No Content).

        Raises:
            InvalidMethodError: If the provided method is not supported.
            InvalidDataTypeError: If `data` or `params` is provided but is not a dictionary.
            APIRequestError: If the request fails at the transport level or the
                response status is not a success.
        """
        if method not in self.SUPPORTED_METHODS:
            raise InvalidMethodError(f"Invalid method: {method}")
        if data is not None and not isinstance(data, dict):
            raise InvalidDataTypeError(f"Invalid data type: {type(data)} expected dict")
        if params is not None and not isinstance(params, dict):
            raise InvalidDataTypeError(
                f"Invalid data type: {type(params)} expected dict"
            )
        try:
            response = requests.request(method, api_url, data=data, params=params)
        except requests.RequestException as e:
            self.log_manager.error(f"API request failed: {e}")
            # Status code 0 signals a transport-level failure (no HTTP response).
            raise APIRequestError(0, str(e))
        if response.status_code not in self.SUCCESS_CODES:
            self.log_manager.error(
                f"API request failed: {response.status_code} - {response.text}"
            )
            raise APIRequestError(response.status_code, response.text)
        # 204 No Content (and any empty body) carries no JSON payload;
        # response.json() would raise a decode error on it.
        if response.status_code == 204 or not response.content:
            return {}
        return response.json()
class PostManager:
    """
    Provides create/read access to post records stored behind the REST API.

    Every operation is delegated to an ApiRequestHandler and logged through a
    LoggingManager.

    Attributes:
        api_request_handler (ApiRequestHandler): Performs the underlying HTTP calls.
        log_manager (LoggingManager): Logger for post operations.
    """

    def __init__(self, api_request_handler: ApiRequestHandler):
        """
        Stores the API handler used for every post operation and sets up logging.

        Parameters:
            api_request_handler (ApiRequestHandler): The handler for making API requests.
        """
        self.api_request_handler = api_request_handler
        self.log_manager = LoggingManager("scraper.log")

    def get_post_by_reddit_id(self, reddit_id: str) -> dict:
        """
        Fetches the stored post record(s) matching a Reddit ID.

        Parameters:
            reddit_id (str): The Reddit ID of the post to look up.

        Returns:
            dict: The API response containing any matching post data.
        """
        self.log_manager.log(f"Getting post by reddit id: {reddit_id}")
        endpoint = f"{self.api_request_handler.api_url}posts/?reddit_id={reddit_id}"
        return self.api_request_handler.send_api_request("GET", endpoint)

    def post_exists(self, reddit_id: str) -> bool:
        """
        Reports whether a post with the given Reddit ID is already stored.

        Parameters:
            reddit_id (str): The Reddit ID to check.

        Returns:
            bool: True when at least one matching record exists, False otherwise.
        """
        self.log_manager.log(f"Checking if post exists: {reddit_id}")
        matches = self.get_post_by_reddit_id(reddit_id)
        return len(matches) > 0

    def insert_post(self, post) -> dict:
        """
        Persists a new post record via the API.

        Parameters:
            post (Post): The post whose fields are sent to the API.

        Returns:
            dict: The API response for the insert.
        """
        self.log_manager.log(f"Inserting post: {post.reddit_id}")
        payload = {
            "reddit_id": post.reddit_id,
            "title": post.title,
            "name": post.name,
            "url": post.url,
            "created_utc": post.created_utc,
            "selftext": post.selftext,
            "permalink": post.permalink,
        }
        endpoint = f"{self.api_request_handler.api_url}posts/"
        return self.api_request_handler.send_api_request("POST", endpoint, data=payload)

    def get_posts_from_last_7_days(self) -> dict:
        """
        Fetches every stored post created within the last seven days.

        Returns:
            dict: The API response containing the recent posts.
        """
        self.log_manager.log("Getting posts from last 7 days")
        endpoint = f"{self.api_request_handler.api_url}posts/?last_7_days=1"
        return self.api_request_handler.send_api_request("GET", endpoint)
class PostAnalyticsManager:
    """
    Tracks per-post analytics snapshots through the REST API.

    Decides whether a post is due for a fresh analytics snapshot and records
    new snapshots (score, comment count, upvote ratio). Relies on a
    PostManager to resolve Reddit IDs to database IDs.

    Attributes:
        api_request_handler (ApiRequestHandler): Performs the underlying HTTP calls.
        post_manager (PostManager): Resolves posts by Reddit ID.
        log_manager (LoggingManager): Logger for analytics operations.
    """

    def __init__(
        self, api_request_handler: ApiRequestHandler, post_manager: PostManager
    ):
        """
        Stores the collaborators used for analytics queries and updates.

        Parameters:
            api_request_handler (ApiRequestHandler): The API request handler for making API calls.
            post_manager (PostManager): The manager for interacting with post data.
        """
        self.api_request_handler = api_request_handler
        self.post_manager = post_manager
        self.log_manager = LoggingManager("scraper.log")

    def check_update_requirements(self, reddit_id: str, update_frequency: int) -> bool:
        """
        Decides whether the post needs a new analytics snapshot.

        A post is due for an update when no analytics record exists for it
        within the last `update_frequency` seconds.

        Parameters:
            reddit_id (str): The Reddit ID of the post to check.
            update_frequency (int): Width of the lookback window, in seconds.

        Returns:
            bool: True when the post should be updated, False otherwise.
        """
        self.log_manager.log(f"Checking update requirements for {reddit_id}")
        utc = ZoneInfo("UTC")
        # Timezone-aware window covering the last `update_frequency` seconds.
        window_start = datetime.now(utc) - timedelta(seconds=update_frequency)
        window_end = datetime.now(utc)
        # API expects ISO-8601 with millisecond precision and a literal Z suffix.
        stamp = "%Y-%m-%dT%H:%M:%S.%f"
        time_begin_str = window_start.strftime(stamp)[:-3] + "Z"
        time_end_str = window_end.strftime(stamp)[:-3] + "Z"
        matches = self.post_manager.get_post_by_reddit_id(reddit_id)
        post_id = matches[0]["id"]
        query_url = (
            f"{self.api_request_handler.api_url}post_analytics/"
            f"?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}"
        )
        self.log_manager.log(query_url)
        recent_snapshots = self.api_request_handler.send_api_request("GET", query_url)
        # A snapshot inside the window means the post was updated recently.
        return len(recent_snapshots) == 0

    def update_post_analytics(self, post: Post) -> dict:
        """
        Records a new analytics snapshot for the given post.

        Parameters:
            post (Post): Post carrying the current score, comment count,
                and upvote ratio.

        Returns:
            dict: The API response for the created snapshot.
        """
        self.log_manager.log(f"Updating post analytics for {post.reddit_id}")
        matches = self.post_manager.get_post_by_reddit_id(post.reddit_id)
        payload = {
            "post": matches[0]["id"],
            "score": post.score,
            "num_comments": post.num_comments,
            "upvote_ratio": post.upvote_ratio,
        }
        return self.api_request_handler.send_api_request(
            "POST", f"{self.api_request_handler.api_url}post_analytics/", data=payload
        )

View File

@ -1,406 +1,6 @@
from datetime import datetime, timedelta
import requests
from models import Post
import praw
from zoneinfo import ZoneInfo
from exceptions import InvalidMethodError, InvalidDataTypeError, APIRequestError
from app_log import LoggingManager
from threads import Scheduler, ThreadManager
class ApiRequestHandler:
"""
Handles API requests for the application. Supports basic HTTP methods: GET, POST, PUT, DELETE.
Utilizes the `requests` library to send requests to a specified API URL and handles
response validation and error logging.
Attributes:
api_url (str): The base URL for the API to which requests are sent.
log_manager (LoggingManager): Manages logging for API request operations.
"""
def __init__(self, api_url: str):
self.api_url = api_url
self.log_manager = LoggingManager("scraper.log")
def send_api_request(
self, method: str, api_url: str, data=None, params=None
) -> dict:
"""
Sends a request to the API using the specified HTTP method, URL, and optional data and parameters.
Parameters:
method (str): The HTTP method to use for the request. Must be one of: GET, POST, PUT, DELETE.
api_url (str): The URL endpoint to send the request to.
data (dict, optional): The payload to send in the request body.
params (dict, optional): The URL parameters to append to the request.
Returns:
dict: The JSON response from the API.
Raises:
InvalidMethodError: If the provided method is not supported.
InvalidDataTypeError: If `data` or `params` is provided but is not a dictionary.
APIRequestError: If the response from the API is not a success.
"""
if method not in ["GET", "POST", "PUT", "DELETE"]:
raise InvalidMethodError(f"Invalid method: {method}")
if data is not None and not isinstance(data, dict):
raise InvalidDataTypeError(f"Invalid data type: {type(data)} expected dict")
if params is not None and not isinstance(params, dict):
raise InvalidDataTypeError(
f"Invalid data type: {type(params)} expected dict"
)
try:
response = requests.request(method, api_url, data=data, params=params)
except requests.RequestException as e:
self.log_manager.error(f"API request failed: {e}")
raise APIRequestError(0, str(e))
success_codes = [200, 201, 204]
if response.status_code not in success_codes:
self.log_manager.error(
f"API request failed: {response.status_code} - {response.text}"
)
raise APIRequestError(response.status_code, response.text)
return response.json()
class PostManager:
"""
Manages operations related to posts, including retrieval and insertion of post data into a database via API requests.
Utilizes an instance of ApiRequestHandler for API interactions and LoggingManager for logging operations.
Attributes:
api_request_handler (ApiRequestHandler): Handles the API requests for interacting with post data.
log_manager (LoggingManager): Manages logging for operations performed by PostManager.
"""
def __init__(self, api_request_handler: ApiRequestHandler):
"""
Initializes the PostManager with an API request handler for making API calls and a logging manager for logging.
Parameters:
api_request_handler (ApiRequestHandler): The handler for making API requests.
"""
self.api_request_handler = api_request_handler
self.log_manager = LoggingManager("scraper.log")
def get_post_by_reddit_id(self, reddit_id: str) -> dict:
"""
Retrieves a post by its Reddit ID from the database through an API call.
Parameters:
reddit_id (str): The Reddit ID of the post to retrieve.
Returns:
dict: The response from the API containing the post data.
"""
self.log_manager.log(f"Getting post by reddit id: {reddit_id}")
response = self.api_request_handler.send_api_request(
"GET", f"{self.api_request_handler.api_url}posts/?reddit_id={reddit_id}"
)
return response
def post_exists(self, reddit_id: str) -> bool:
"""
Checks if a post with the specified Reddit ID exists in the database.
Parameters:
reddit_id (str): The Reddit ID of the post to check.
Returns:
bool: True if the post exists, False otherwise.
"""
self.log_manager.log(f"Checking if post exists: {reddit_id}")
response = self.get_post_by_reddit_id(reddit_id)
if len(response) == 0:
return False
return True
def insert_post(self, post) -> dict:
"""
Inserts a new post into the database through an API call.
Parameters:
post (Post): The Post object containing the data to insert.
Returns:
dict: The response from the API after attempting to insert the post data.
"""
self.log_manager.log(f"Inserting post: {post.reddit_id}")
data = {
"reddit_id": post.reddit_id,
"title": post.title,
"name": post.name,
"url": post.url,
"created_utc": post.created_utc,
"selftext": post.selftext,
"permalink": post.permalink,
}
response = self.api_request_handler.send_api_request(
"POST", f"{self.api_request_handler.api_url}posts/", data=data
)
return response
def get_posts_from_last_7_days(self) -> dict:
"""
Retrieves posts from the last 7 days from the database through an API call.
Returns:
dict: The response from the API containing the posts from the last 7 days.
"""
self.log_manager.log("Getting posts from last 7 days")
posts_from_last_7_days = self.api_request_handler.send_api_request(
"GET", f"{self.api_request_handler.api_url}posts/?last_7_days=1"
)
return posts_from_last_7_days
class PostAnalyticsManager:
"""
Manages the analytics for posts by interfacing with the API to check for update requirements
and update post analytics. This class leverages the ApiRequestHandler for API interactions
and the PostManager for retrieving specific post information.
Attributes:
api_request_handler (ApiRequestHandler): Handles API requests for analytics data.
post_manager (PostManager): Manages post retrieval and existence checks.
log_manager (LoggingManager): Manages logging for analytics operations.
"""
def __init__(
self, api_request_handler: ApiRequestHandler, post_manager: PostManager
):
"""
Initializes the PostAnalyticsManager with necessary handlers and managers.
Parameters:
api_request_handler (ApiRequestHandler): The API request handler for making API calls.
post_manager (PostManager): The manager for interacting with post data.
"""
self.api_request_handler = api_request_handler
self.post_manager = post_manager
self.log_manager = LoggingManager("scraper.log")
def check_update_requirements(self, reddit_id: str, update_frequency: int) -> bool:
"""
Checks if the post identified by the given reddit_id meets the requirements for an update
by analyzing the analytics data within the last x seconds (update_frequency).
Parameters:
reddit_id (str): The Reddit ID of the post to check.
update_frequency (int): The frequency in seconds for updating post analytics.
Returns:
bool: True if the post meets update requirements, False otherwise.
"""
self.log_manager.log(f"Checking update requirements for {reddit_id}")
# Specify your desired timezone, e.g., UTC
timezone = ZoneInfo("UTC")
# Make your datetime objects timezone-aware
time_start = datetime.now(timezone) - timedelta(seconds=update_frequency)
now = datetime.now(timezone)
# Format datetime objects for the API request
time_begin_str = time_start.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
time_end_str = now.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
post_id = self.post_manager.get_post_by_reddit_id(reddit_id)
post_id = post_id[0]["id"]
self.log_manager.log(
f"{self.api_request_handler.api_url}post_analytics/?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}"
)
response = self.api_request_handler.send_api_request(
"GET",
f"{self.api_request_handler.api_url}post_analytics/?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}",
)
if len(response) > 0:
# post should not be updated
return False
# post should be updated
return True
def update_post_analytics(self, post: Post) -> dict:
"""
Updates the analytics for a given post with new data such as score, number of comments,
and upvote ratio.
Parameters:
post (Post): The post object containing the new analytics data.
Returns:
dict: The response from the API after updating the post's analytics.
"""
self.log_manager.log(f"Updating post analytics for {post.reddit_id}")
post_id = self.post_manager.get_post_by_reddit_id(post.reddit_id)
post_id = post_id[0]["id"]
data = {
"post": post_id,
"score": post.score,
"num_comments": post.num_comments,
"upvote_ratio": post.upvote_ratio,
}
response = self.api_request_handler.send_api_request(
"POST", f"{self.api_request_handler.api_url}post_analytics/", data=data
)
return response
class RedditMonitor:
"""
Monitors Reddit submissions for a specific subreddit, streaming new submissions and
updating existing ones. Utilizes PRAW (Python Reddit API Wrapper) to interact with Reddit's API.
Attributes:
reddit (praw.Reddit): An instance of the PRAW Reddit class for API interactions.
subreddit (praw.models.Subreddit): The subreddit object for the specified subreddit.
log_manager (LoggingManager): Manages logging for Reddit monitoring operations.
"""
def __init__(
self, client_id, client_secret, user_agent, username, password, subreddit_name
):
"""
Initializes the RedditMonitor with credentials for Reddit API access and the target subreddit.
Parameters:
client_id (str): The client ID for the Reddit API application.
client_secret (str): The client secret for the Reddit API application.
user_agent (str): The user agent string identifying the application to Reddit.
username (str): The Reddit account username for authentication.
password (str): The Reddit account password for authentication.
subreddit_name (str): The name of the subreddit to monitor.
"""
self.reddit = praw.Reddit(
client_id=client_id,
client_secret=client_secret,
user_agent=user_agent,
username=username,
password=password,
)
self.subreddit = self.reddit.subreddit(subreddit_name)
self.log_manager = LoggingManager("scraper.log")
def stream_submissions(self):
"""
Streams new submissions from the specified subreddit, yielding each submission
as it becomes available.
Yields:
praw.models.Submission: A submission object representing a Reddit post.
"""
self.log_manager.info("Starting submission stream")
for submission in self.subreddit.stream.submissions():
yield submission
def update_submissions(self, posts_to_update):
"""
Retrieves and yields submissions corresponding to a list of posts that need to be updated,
identified by their Reddit IDs.
Parameters:
posts_to_update (list of dict): A list of dictionaries, each containing the 'reddit_id' of a post to update.
Yields:
praw.models.Submission: A submission object for each post that needs to be updated.
"""
self.log_manager.info("Updating submissions")
for post in posts_to_update:
submission = self.reddit.submission(id=post["reddit_id"])
yield submission
class SubmissionManager:
"""
Manages the processing of Reddit submissions, including conversion to post objects,
checking for updates, and notifying via webhook. It integrates closely with RedditMonitor,
PostManager, and PostAnalyticsManager to streamline the handling of new and existing submissions.
Attributes:
reddit_monitor (RedditMonitor): Monitors and streams Reddit submissions.
post_manager (PostManager): Manages post data interactions.
post_analytics_manager (PostAnalyticsManager): Manages post analytics data.
webhook_notifier (WebhookNotifier): Handles notifications for new or updated posts.
log_manager (LoggingManager): Manages logging for submission processing operations.
"""
def __init__(
self,
reddit_monitor: RedditMonitor,
post_manager: PostManager,
post_analytics_manager: PostAnalyticsManager,
WebhookNotifier,
):
"""
Initializes the SubmissionManager with necessary components for processing submissions.
Parameters:
reddit_monitor (RedditMonitor): The component for monitoring Reddit submissions.
post_manager (PostManager): The component for managing post data.
post_analytics_manager (PostAnalyticsManager): The component for managing post analytics.
WebhookNotifier: The component for sending notifications about posts.
"""
self.reddit_monitor = reddit_monitor
self.post_manager = post_manager
self.post_analytics_manager = post_analytics_manager
self.webhook_notifier = WebhookNotifier
self.log_manager = LoggingManager("scraper.log")
def convert_submission_to_post(self, submission):
"""
Converts a Reddit submission object into a Post object suitable for database insertion
or analytics processing.
Parameters:
submission (praw.models.Submission): The Reddit submission to convert.
Returns:
Post: A Post object populated with data from the Reddit submission.
"""
post = Post(
reddit_id=submission.id,
title=submission.title,
name=submission.name,
url=submission.url,
score=submission.score,
num_comments=submission.num_comments,
created_utc=submission.created_utc,
selftext=submission.selftext,
permalink=submission.permalink,
upvote_ratio=submission.upvote_ratio,
)
return post
def process_submissions(self, submissions, update_frequency=None):
"""
Processes a stream of Reddit submissions, checking for their existence, updating analytics,
and notifying via webhook if necessary. Optionally respects an update frequency to limit updates.
Parameters:
submissions (Iterable[praw.models.Submission]): An iterable of Reddit submission objects to process.
update_frequency (int, optional): The minimum frequency in seconds to update a post's analytics.
"""
for submission in submissions:
self.log_manager.log(submission)
if self.post_manager.post_exists(submission.id):
self.log_manager.log("Post exists")
self.log_manager.log(f"post id: {submission.id}")
if self.post_analytics_manager.check_update_requirements(
submission.id, update_frequency
):
self.log_manager.log("Update requirements met")
post = self.convert_submission_to_post(submission)
self.post_analytics_manager.update_post_analytics(post)
else:
post = self.convert_submission_to_post(submission)
self.post_manager.insert_post(post)
self.post_analytics_manager.update_post_analytics(post)
self.webhook_notifier.send_notification(post)
from threads import Scheduler
from costco import CostcoMonitor
class Application:
@ -430,7 +30,6 @@ class Application:
post_manager,
post_analytics_manager,
submission_manager,
update_frequency,
):
"""
Initializes the application with all necessary components.
@ -451,31 +50,42 @@ class Application:
self.post_analytics_manager = post_analytics_manager
self.log_manager = LoggingManager("scraper.log")
self.submission_manager = submission_manager
self.scheduler = None
self.thread_manager = None
self.scheduler = Scheduler()
# how often should post analytics be updated (call for update and database update are separate)
self.update_frequency = update_frequency
self.update_analytics_frequency = 60 * 15
self.scrape_costco_frequency = 60 * 60
def periodic_update(self, update_frequency):
def update_analytics(self):
"""
Executes periodic updates for post analytics based on a predefined frequency.
Parameters:
update_frequency (int): The frequency, in seconds, at which to perform updates.
Executes periodic updates for post analytics based on the predefined frequency.
"""
self.log_manager.info("Running periodic update")
self.log_manager.info("Running periodic analytics update")
to_be_updated = self.post_manager.get_posts_from_last_7_days()
submissions = self.reddit_monitor.update_submissions(to_be_updated)
self.submission_manager.process_submissions(submissions, update_frequency)
self.submission_manager.process_submissions(submissions, self.update_analytics_frequency)
def run_periodic_update(self):
def scrape_costco(self):
"""
Initializes and runs the scheduler for periodic updates.
Executes periodic updates for costco products based on the predefined frequency.
"""
self.scheduler = Scheduler(
self.update_frequency, lambda: self.periodic_update(self.update_frequency)
)
self.scheduler.run()
self.log_manager.info("Running periodic costco scrape")
costco_monitor = CostcoMonitor("https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon")
products = costco_monitor.get_products()
costco_monitor.close()
self.log_manager.info(f"Found {len(products)} products on the page")
self.log_manager.info(products)
self.webhook_notifier.costco_notification(products)
def add_scheduler_task(self, name, task, interval):
"""
Registers a task with the scheduler to be run at a specified interval.
Parameters:
name (str): Name of the task.
task (callable): The task function to be executed.
interval (int): The frequency in seconds at which the task should be executed.
"""
self.scheduler.add_task(name, task, interval)
def run(self):
"""
@ -483,7 +93,11 @@ class Application:
and processing submissions.
"""
self.log_manager.info("Application started")
self.thread_manager = ThreadManager(target=self.run_periodic_update, args=())
self.thread_manager.run()
# tasks
self.add_scheduler_task("update_analytics", self.update_analytics, self.update_analytics_frequency)
self.add_scheduler_task("scrape_costco", self.scrape_costco, self.scrape_costco_frequency)
# Stream submissions and process them
submissions = self.reddit_monitor.stream_submissions()
self.submission_manager.process_submissions(submissions, self.update_frequency)
self.submission_manager.process_submissions(submissions, self.update_analytics_frequency)

View File

@ -12,4 +12,3 @@ class Config:
USER_AGENT = "praw:zman.video_repost_bot:v0.1.0 (by u/jzman21)"
DISABLE_WEBHOOK = False
API_URL = "http://server:8000/api/"
UPDATE_FREQUENCY = 60 * 15 # 15 minutes

80
scraper/costco.py Normal file
View File

@ -0,0 +1,80 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from app_log import LoggingManager
class CostcoMonitor:
    """
    Scrapes product listings from a Costco search-results page using a
    headless Chrome WebDriver.

    Attributes:
        url (str): The Costco page URL to scrape.
        driver (webdriver.Chrome): The Selenium Chrome driver instance.
        log_manager (LoggingManager): Logger for scraping operations.
    """

    def __init__(self, url):
        """
        Configures and starts a headless Chrome driver for the target URL.

        Parameters:
            url (str): The Costco page URL to scrape.
        """
        self.url = url
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Remove this line if you want to see the browser
        # Spoof a desktop browser UA — presumably to avoid bot detection; confirm against site behavior.
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--log-level=3")
        # --no-sandbox / --disable-dev-shm-usage are needed when Chrome runs in a container.
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.log_manager = LoggingManager("scraper.log")

    def wait_for_page_load(self):
        """Blocks until document.readyState is 'complete', or logs after 20s."""
        try:
            WebDriverWait(self.driver, 20).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            self.log_manager.error("Timed out waiting for page to load")

    def get_products(self):
        """
        Loads the page and extracts details for each product tile.

        Returns:
            list[dict]: One dict per product with keys: sku, name, price,
            img_url, product_link. Empty list if the product grid never
            becomes visible.
        """
        self.log_manager.info(f"Loading Costco page: {self.url}")
        self.driver.get(self.url)
        self.wait_for_page_load()  # Wait for the page to fully load
        # Wait for the product list to be visible on the page. Bail out with an
        # empty result instead of letting TimeoutException crash the caller
        # (this runs inside a scheduled background task).
        try:
            WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "div.product-list.grid"))
            )
        except TimeoutException:
            self.log_manager.error("Timed out waiting for product list to load")
            return []
        products = self.driver.find_elements(By.CSS_SELECTOR, "div.col-xs-6.col-lg-4.col-xl-3.product")
        self.log_manager.info(f"Found {len(products)} products on the page")
        product_detail_list = []
        for product in products:
            try:
                product_sku = product.find_element(By.CSS_SELECTOR, "input[id^='product_sku_']").get_attribute('value')
                product_name = product.find_element(By.CSS_SELECTOR, "input[id^='product_name_']").get_attribute('value')
                price_element = product.find_element(By.CSS_SELECTOR, "div[class*='price']")
                price = price_element.text if price_element else "Price not found"
                img_element = product.find_element(By.CSS_SELECTOR, "a.product-image-url img.img-responsive")
                img_url = img_element.get_attribute('src') if img_element else "Image URL not found"
                product_link_element = product.find_element(By.CSS_SELECTOR, "a.product-image-url")
                product_link = product_link_element.get_attribute('href') if product_link_element else "Product link not found"
                product_detail_list.append({
                    "sku": product_sku,
                    "name": product_name,
                    "price": price,
                    "img_url": img_url,
                    "product_link": product_link
                })
                self.log_manager.log(f"SKU: {product_sku}, Name: {product_name}, Price: {price}, Image URL: {img_url}, Product Link: {product_link}")
            except Exception as e:
                # Skip malformed tiles rather than aborting the whole scrape.
                self.log_manager.error(f"Error processing product: {e}")
        return product_detail_list

    def close(self):
        """Shuts down the browser and releases the WebDriver process."""
        self.driver.quit()
        self.log_manager.info("Browser closed")
if __name__ == "__main__":
    # Ad-hoc manual test: scrape the Pokemon search results once.
    url = "https://www.costco.com/CatalogSearch?dept=All&keyword=pokemon"
    monitor = CostcoMonitor(url)
    try:
        monitor.get_products()
    finally:
        # Always release the Chrome driver, even if the scrape raises;
        # otherwise the headless browser process leaks.
        monitor.close()

View File

@ -1,12 +1,7 @@
from webhook import WebhookNotifier
from app import (
Application,
RedditMonitor,
ApiRequestHandler,
PostManager,
PostAnalyticsManager,
SubmissionManager,
)
from app import Application
from api import ApiRequestHandler, PostManager, PostAnalyticsManager
from reddit import RedditMonitor, SubmissionManager
from config import Config
from app_log import LoggingManager
@ -23,7 +18,6 @@ if __name__ == "__main__":
disable_webhook = Config.DISABLE_WEBHOOK
pkmn_env = Config.PKMN_ENV
api_url = Config.API_URL
update_frequency = Config.UPDATE_FREQUENCY
reddit_monitor = RedditMonitor(
client_id, client_secret, user_agent, username, password, subreddit_name
@ -42,7 +36,6 @@ if __name__ == "__main__":
post_manager,
post_analytics_manager,
submission_manager,
update_frequency,
)
app.run()

156
scraper/reddit.py Normal file
View File

@ -0,0 +1,156 @@
import praw
from app_log import LoggingManager
from models import Post
from api import PostManager, PostAnalyticsManager
class RedditMonitor:
    """
    Watches a single subreddit through PRAW, yielding brand-new submissions
    as they arrive and re-fetching known posts on demand.

    Attributes:
        reddit (praw.Reddit): Authenticated PRAW client.
        subreddit (praw.models.Subreddit): Handle to the monitored subreddit.
        log_manager (LoggingManager): Logger for monitoring operations.
    """

    def __init__(
        self, client_id, client_secret, user_agent, username, password, subreddit_name
    ):
        """
        Builds the authenticated Reddit client and binds it to one subreddit.

        Parameters:
            client_id (str): The client ID for the Reddit API application.
            client_secret (str): The client secret for the Reddit API application.
            user_agent (str): The user agent string identifying the application to Reddit.
            username (str): The Reddit account username for authentication.
            password (str): The Reddit account password for authentication.
            subreddit_name (str): The name of the subreddit to monitor.
        """
        credentials = {
            "client_id": client_id,
            "client_secret": client_secret,
            "user_agent": user_agent,
            "username": username,
            "password": password,
        }
        self.reddit = praw.Reddit(**credentials)
        self.subreddit = self.reddit.subreddit(subreddit_name)
        self.log_manager = LoggingManager("scraper.log")

    def stream_submissions(self):
        """
        Yields each new submission from the subreddit as it becomes available.

        Yields:
            praw.models.Submission: A submission object representing a Reddit post.
        """
        self.log_manager.info("Starting submission stream")
        yield from self.subreddit.stream.submissions()

    def update_submissions(self, posts_to_update):
        """
        Re-fetches and yields the submission for each post record given.

        Parameters:
            posts_to_update (list of dict): Dicts each containing the
                'reddit_id' of a post to refresh.

        Yields:
            praw.models.Submission: A submission object for each post.
        """
        self.log_manager.info("Updating submissions")
        for record in posts_to_update:
            yield self.reddit.submission(id=record["reddit_id"])
class SubmissionManager:
    """
    Coordinates the handling of Reddit submissions: converts them to Post
    objects, inserts new posts, refreshes analytics for known posts, and
    pushes webhook notifications for newly seen posts.

    Attributes:
        reddit_monitor (RedditMonitor): Source of Reddit submissions.
        post_manager (PostManager): Persistence layer for post records.
        post_analytics_manager (PostAnalyticsManager): Persistence layer for
            per-post analytics snapshots.
        webhook_notifier: Notifier used to announce new posts.
        log_manager (LoggingManager): Logger for submission processing.
    """

    def __init__(
        self,
        reddit_monitor: RedditMonitor,
        post_manager: PostManager,
        post_analytics_manager: PostAnalyticsManager,
        WebhookNotifier,
    ):
        """
        Wire up the collaborators needed to process submissions.

        Parameters:
            reddit_monitor (RedditMonitor): Monitors Reddit submissions.
            post_manager (PostManager): Manages post data.
            post_analytics_manager (PostAnalyticsManager): Manages analytics.
            WebhookNotifier: Sends notifications about posts.
        """
        self.reddit_monitor = reddit_monitor
        self.post_manager = post_manager
        self.post_analytics_manager = post_analytics_manager
        self.webhook_notifier = WebhookNotifier
        self.log_manager = LoggingManager("scraper.log")

    def convert_submission_to_post(self, submission):
        """
        Build a Post object from a Reddit submission.

        Parameters:
            submission (praw.models.Submission): The submission to convert.

        Returns:
            Post: A Post populated with the submission's fields.
        """
        fields = dict(
            reddit_id=submission.id,
            title=submission.title,
            name=submission.name,
            url=submission.url,
            score=submission.score,
            num_comments=submission.num_comments,
            created_utc=submission.created_utc,
            selftext=submission.selftext,
            permalink=submission.permalink,
            upvote_ratio=submission.upvote_ratio,
        )
        return Post(**fields)

    def process_submissions(self, submissions, update_frequency=None):
        """
        Process a stream of submissions: insert and announce unseen posts,
        and refresh analytics for known posts whose update window has passed.

        Parameters:
            submissions (Iterable[praw.models.Submission]): Submissions to process.
            update_frequency (int, optional): Minimum seconds between analytics
                updates for an existing post.
        """
        for submission in submissions:
            self.log_manager.log(submission)
            if not self.post_manager.post_exists(submission.id):
                # First sighting: persist, snapshot analytics, and notify.
                post = self.convert_submission_to_post(submission)
                self.post_manager.insert_post(post)
                self.post_analytics_manager.update_post_analytics(post)
                self.webhook_notifier.send_notification(post)
                continue
            self.log_manager.log("Post exists")
            self.log_manager.log(f"post id: {submission.id}")
            if self.post_analytics_manager.check_update_requirements(
                submission.id, update_frequency
            ):
                self.log_manager.log("Update requirements met")
                post = self.convert_submission_to_post(submission)
                self.post_analytics_manager.update_post_analytics(post)

Binary file not shown.

View File

@ -1,26 +1,53 @@
import threading
import time
class Scheduler:
    """
    Runs named recurring tasks in the background, each on its own daemon
    thread, with a per-task stop event for clean shutdown.

    Task records are kept in self.tasks, keyed by task name, each holding
    the keys "interval", "function", "stop_event", and "thread".
    """

    def __init__(self):
        # task_name -> {"interval", "function", "stop_event", "thread"}
        self.tasks = {}

    def add_task(self, task_name, function, interval):
        """
        Register a new task and start running it immediately.

        Parameters:
            task_name (str): Unique name for the task.
            function (callable): Zero-argument callable run on each cycle.
            interval (int): Time in seconds between each execution of the task.
        """
        task = {
            "interval": interval,
            "function": function,
            "stop_event": threading.Event(),
            "thread": threading.Thread(
                target=self.run_task, args=(task_name,), daemon=True
            ),
        }
        self.tasks[task_name] = task
        task["thread"].start()

    def run_task(self, task_name):
        """
        Thread body: run the task's function, then wait out the interval,
        repeating until the task's stop event is set.

        Parameters:
            task_name (str): The name of the task to run.
        """
        task = self.tasks[task_name]
        while not task["stop_event"].is_set():
            task["function"]()
            # wait() doubles as the sleep and wakes early when stopped.
            task["stop_event"].wait(task["interval"])

    def stop_task(self, task_name):
        """
        Signal the named task to stop after its current cycle.

        Parameters:
            task_name (str): The name of the task to stop.
        """
        if task_name in self.tasks:
            self.tasks[task_name]["stop_event"].set()

    def stop_all_tasks(self):
        """Signal every registered task to stop."""
        # Iterate over a snapshot so stopping tasks can't invalidate the view.
        for task_name in list(self.tasks):
            self.stop_task(task_name)

View File

@ -25,3 +25,24 @@ class WebhookNotifier:
requests.post(self.webhook_url, data={"content": content})
except Exception as e:
self.log_manager.error(f"Failed to send notification: {e}")
def costco_notification(self, data):
    """
    Send one webhook message per product reported by the Costco scraper.

    Parameters:
        data (list of dict): Product records with 'sku', 'name', 'price',
            'img_url', and 'product_link' keys.
    """
    for product in data:
        sku = product.get("sku")  # extracted but not included in the message
        name = product.get("name")
        price = product.get("price")
        img_url = product.get("img_url")
        product_link = product.get("product_link")
        content = f"""
**Costco has a new item!**
**Name:** {name}
**Price:** {price}
**Link:** {product_link}
{img_url}"""
        if self.disable_webhook:
            continue
        self.log_manager.log(f"Sending notification to {self.webhook_url}")
        try:
            requests.post(self.webhook_url, data={"content": content})
        except Exception as e:
            self.log_manager.error(f"Failed to send notification: {e}")