finish scraper django integration
parent c16bbb5275
commit 9a666df52c

.gitignore (vendored): 3 changes
@@ -1,4 +1,5 @@
 *.db
 __pycache__
 .venv
 *.sqlite3
+*.log

scraper/app.py: 306 changes
@@ -1,103 +1,247 @@
-import threading
-import time
-from datetime import datetime
+from datetime import datetime, timedelta
 import requests
-from models import Submission
-import logging
+from models import Post
+import praw
+from zoneinfo import ZoneInfo
+from exceptions import InvalidMethodError, InvalidDataTypeError, APIRequestError
+from app_log import LoggingManager
+from threads import Scheduler, ThreadManager
 
-# logging
-logging.basicConfig(level=logging.INFO)
 
-class Application:
-    def __init__(self, reddit_monitor, webhook_notifier, api_url):
-        self.reddit_monitor = reddit_monitor
-        self.webhook_notifier = webhook_notifier
+class ApiRequestHandler:
+    def __init__(self, api_url: str):
         self.api_url = api_url
+        self.log_manager = LoggingManager("scraper.log")
 
-    def send_api_request(self, method, url, data=None, params=None):
-        response = requests.request(method, url, data=data, params=params)
+    def send_api_request(
+        self, method: str, api_url: str, data=None, params=None
+    ) -> dict:
+        if method not in ["GET", "POST", "PUT", "DELETE"]:
+            raise InvalidMethodError(f"Invalid method: {method}")
+        if data is not None and not isinstance(data, dict):
+            raise InvalidDataTypeError(f"Invalid data type: {type(data)} expected dict")
+        if params is not None and not isinstance(params, dict):
+            raise InvalidDataTypeError(
+                f"Invalid data type: {type(params)} expected dict"
+            )
+        response = requests.request(method, api_url, data=data, params=params)
+        success_codes = [200, 201, 204]
+        if response.status_code not in success_codes:
+            self.log_manager.error(
+                f"API request failed: {response.status_code} - {response.text}"
+            )
+            raise APIRequestError(response.status_code, response.text)
         return response.json()
 
-    def get_submission_by_reddit_id(self, reddit_id):
-        logging.info(f"Getting submission by reddit_id: {reddit_id}")
-        logging.info(f"{self.api_url}submissions/?reddit_id={reddit_id}")
-        response = self.send_api_request("GET", f"{self.api_url}submissions/?reddit_id={reddit_id}")
-        logging.info(response)
+
+class PostManager:
+    def __init__(self, api_request_handler: ApiRequestHandler):
+        self.api_request_handler = api_request_handler
+        self.log_manager = LoggingManager("scraper.log")
+
+    def get_post_by_reddit_id(self, reddit_id: str) -> dict:
+        self.log_manager.log(f"Getting post by reddit id: {reddit_id}")
+        response = self.api_request_handler.send_api_request(
+            "GET", f"{self.api_request_handler.api_url}posts/?reddit_id={reddit_id}"
+        )
         return response
 
-    def submission_exists(self, reddit_id):
-        response = self.get_submission_by_reddit_id(reddit_id)
+    def post_exists(self, reddit_id: str) -> bool:
+        self.log_manager.log(f"Checking if post exists: {reddit_id}")
+        response = self.get_post_by_reddit_id(reddit_id)
         if len(response) == 0:
-            logging.info(f"Submission {reddit_id} does not exist")
             return False
         return True
 
-    def update_submission_analytics(self, submission):
-        submission_id = self.get_submission_by_reddit_id(submission.reddit_id)
-        logging.info(submission_id)
-        submission_id = submission_id[0]["id"]
+    def insert_post(self, post) -> dict:
+        self.log_manager.log(f"Inserting post: {post.reddit_id}")
+        self.post = post
         data = {
-            "id": submission_id,
-            "score": submission.score,
-            "num_comments": submission.num_comments,
+            "reddit_id": self.post.reddit_id,
+            "title": self.post.title,
+            "name": self.post.name,
+            "url": self.post.url,
+            "created_utc": self.post.created_utc,
+            "selftext": self.post.selftext,
+            "permalink": self.post.permalink,
         }
-        self.send_api_request("PATCH", f"{self.api_url}submissions/{submission_id}/", data=data)
-
-    def get_submissions_to_update(self):
-        submissions_to_update = self.send_api_request("GET", f"{self.api_url}submissions/?last_7_days=1")
-        return submissions_to_update
+        response = self.api_request_handler.send_api_request(
+            "POST", f"{self.api_request_handler.api_url}posts/", data=data
+        )
+        return response
 
-    def insert_submission(self, submission):
+    def get_posts_from_last_7_days(self) -> dict:
+        self.log_manager.log("Getting posts from last 7 days")
+        posts_from_last_7_days = self.api_request_handler.send_api_request(
+            "GET", f"{self.api_request_handler.api_url}posts/?last_7_days=1"
+        )
+        return posts_from_last_7_days
+
+
+class PostAnalyticsManager:
+    def __init__(
+        self, api_request_handler: ApiRequestHandler, post_manager: PostManager
+    ):
+        self.api_request_handler = api_request_handler
+        self.post_manager = post_manager
+        self.log_manager = LoggingManager("scraper.log")
+
+    def check_update_requirements(self, reddit_id: str) -> bool:
+        self.log_manager.log(f"Checking update requirements for {reddit_id}")
+
+        # Specify your desired timezone, e.g., UTC
+        timezone = ZoneInfo("UTC")
+
+        # Make your datetime objects timezone-aware
+        fifteen_minutes_ago = datetime.now(timezone) - timedelta(minutes=15)
+        now = datetime.now(timezone)
+
+        # Format datetime objects for the API request
+        time_begin_str = fifteen_minutes_ago.isoformat(timespec="seconds")
+        time_end_str = now.isoformat(timespec="seconds")
+
+        post_id = self.post_manager.get_post_by_reddit_id(reddit_id)
+        post_id = post_id[0]["id"]
+        self.log_manager.log(
+            f"{self.api_request_handler.api_url}post_analytics/?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}"
+        )
+
+        response = self.api_request_handler.send_api_request(
+            "GET",
+            f"{self.api_request_handler.api_url}post_analytics/?post={post_id}&time_begin={time_begin_str}&time_end={time_end_str}",
+        )
+
+        if len(response) > 0:
+            # post should not be updated
+            return False
+
+        # post should be updated
+        return True
+
+    def update_post_analytics(self, post: Post) -> dict:
+        self.log_manager.log(f"Updating post analytics for {post.reddit_id}")
+        post_id = self.post_manager.get_post_by_reddit_id(post.reddit_id)
+        post_id = post_id[0]["id"]
         data = {
-            "reddit_id": submission.reddit_id,
-            "title": submission.title,
-            "name": submission.name,
-            "url": submission.url,
-            "created_utc": submission.created_utc,
-            "selftext": submission.selftext,
-            "permalink": submission.permalink,
-            "upvote_ratio": submission.upvote_ratio,
+            "post": post_id,
+            "score": post.score,
+            "num_comments": post.num_comments,
+            "upvote_ratio": post.upvote_ratio,
         }
-        response = self.send_api_request("POST", f"{self.api_url}submissions/", data=data)
-        logging.info("Inserting submission")
-        logging.info(response)
+        response = self.api_request_handler.send_api_request(
+            "POST", f"{self.api_request_handler.api_url}post_analytics/", data=data
+        )
+        return response
+
+
+class RedditMonitor:
+    def __init__(
+        self, client_id, client_secret, user_agent, username, password, subreddit_name
+    ):
+        self.reddit = praw.Reddit(
+            client_id=client_id,
+            client_secret=client_secret,
+            user_agent=user_agent,
+            username=username,
+            password=password,
+        )
+        self.subreddit = self.reddit.subreddit(subreddit_name)
+        self.log_manager = LoggingManager("scraper.log")
+
+    def stream_submissions(self):
+        self.log_manager.info("Starting submission stream")
+        for submission in self.subreddit.stream.submissions():
+            yield submission
+
+    def update_submissions(self, posts_to_update):
+        self.log_manager.info("Updating submissions")
+        for post in posts_to_update:
+            submission = self.reddit.submission(id=post["reddit_id"])
+            yield submission
+
+
+class SubmissionManager:
+    def __init__(
+        self,
+        reddit_monitor: RedditMonitor,
+        post_manager: PostManager,
+        post_analytics_manager: PostAnalyticsManager,
+        WebhookNotifier,
+    ):
+        self.reddit_monitor = reddit_monitor
+        self.post_manager = post_manager
+        self.post_analytics_manager = post_analytics_manager
+        self.webhook_notifier = WebhookNotifier
+        self.log_manager = LoggingManager("scraper.log")
+
+    def convert_submission_to_post(self, submission):
+        post = Post(
+            reddit_id=submission.id,
+            title=submission.title,
+            name=submission.name,
+            url=submission.url,
+            score=submission.score,
+            num_comments=submission.num_comments,
+            created_utc=submission.created_utc,
+            selftext=submission.selftext,
+            permalink=submission.permalink,
+            upvote_ratio=submission.upvote_ratio,
+        )
+        return post
 
     def process_submissions(self, submissions):
         for submission in submissions:
-            submission = Submission(
-                reddit_id=submission.id,
-                title=submission.title,
-                name=submission.name,
-                url=submission.url,
-                score=submission.score,
-                num_comments=submission.num_comments,
-                created_utc=submission.created_utc,
-                selftext=submission.selftext,
-                permalink=submission.permalink,
-                upvote_ratio=submission.upvote_ratio
-            )
-            if self.submission_exists(submission.reddit_id):
-                self.update_submission_analytics(submission)
+            self.log_manager.log(submission)
+            if self.post_manager.post_exists(submission.id):
+                self.log_manager.log("Post exists")
+                self.log_manager.log(f"post id: {submission.id}")
+                if self.post_analytics_manager.check_update_requirements(submission.id):
+                    self.log_manager.log("Update requirements met")
+                    post = self.convert_submission_to_post(submission)
+                    self.post_analytics_manager.update_post_analytics(post)
             else:
-                self.insert_submission(submission)
-                self.update_submission_analytics(submission)
-                self.webhook_notifier.send_notification(submission)
+                post = self.convert_submission_to_post(submission)
+                self.post_manager.insert_post(post)
+                self.post_analytics_manager.update_post_analytics(post)
+                self.webhook_notifier.send_notification(post)
+
+
+class Application:
+    def __init__(
+        self,
+        reddit_monitor,
+        webhook_notifier,
+        api_conn,
+        post_manager,
+        post_analytics_manager,
+        submission_manager,
+    ):
+        self.reddit_monitor = reddit_monitor
+        self.webhook_notifier = webhook_notifier
+        self.api_conn = api_conn
+        self.post_manager = post_manager
+        self.post_analytics_manager = post_analytics_manager
+        self.log_manager = LoggingManager("scraper.log")
+        self.submission_manager = submission_manager
+        self.scheduler = None
+        self.thread_manager = None
 
     def periodic_update(self):
-        to_be_updated = self.get_submissions_to_update()
+        self.log_manager.info("Running periodic update")
+        to_be_updated = self.post_manager.get_posts_from_last_7_days()
         submissions = self.reddit_monitor.update_submissions(to_be_updated)
-        self.process_submissions(submissions)
+        self.submission_manager.process_submissions(submissions)
 
-    def run_periodic_update(self, interval=3600):
-        while True:
-            self.periodic_update()
-            print(f"Existing posts Updated at {datetime.now()}")
-            time.sleep(interval)
+    def run_periodic_update(self, interval):
+        self.scheduler = Scheduler(interval, self.periodic_update)
+        self.scheduler.run()
 
     def run(self):
-        #update_frequency = 3600 # 3600
-        #update_thread = threading.Thread(target=self.run_periodic_update, args=(update_frequency, ))
-        #update_thread.daemon = True
-        #update_thread.start()
+        self.log_manager.info("Application started")
+        update_frequency = 60 * 15  # 15 minutes in seconds
+        self.thread_manager = ThreadManager(
+            target=self.run_periodic_update, args=(update_frequency,)
+        )
+        self.thread_manager.run()
         submissions = self.reddit_monitor.stream_submissions()
-        self.process_submissions(submissions)
+        self.submission_manager.process_submissions(submissions)
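
Note: a minimal usage sketch of the new API layer above. It assumes the Django API from this commit is running and reachable at a local URL (the committed Config points at http://server:8000/api/); the reddit id is a placeholder.

    from app import ApiRequestHandler, PostManager
    from exceptions import APIRequestError

    api_conn = ApiRequestHandler("http://localhost:8000/api/")  # assumed local URL
    post_manager = PostManager(api_conn)

    try:
        # The DRF list endpoint returns a JSON list; an empty list means no match.
        posts = post_manager.get_post_by_reddit_id("abc123")  # placeholder reddit id
        print(posts)
    except APIRequestError as e:
        print(e.status_code, e.message)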

scraper/app_log.py: 46 lines (new file)
@@ -0,0 +1,46 @@
+import logging
+from logging.handlers import RotatingFileHandler
+import sys
+
+
+class SingletonMeta(type):
+    _instances = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            cls._instances[cls] = super(SingletonMeta, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
+
+
+class LoggingManager(metaclass=SingletonMeta):
+    def __init__(self, log_file):
+        if not hasattr(self, "logger"):
+            self.log_file = log_file
+            self.logger = logging.getLogger("scraper")
+            self.logger.setLevel(logging.DEBUG)
+
+            file_handler = RotatingFileHandler(
+                self.log_file, maxBytes=1024 * 1024 * 5, backupCount=5
+            )
+            file_handler.setLevel(logging.DEBUG)
+
+            stream_handler = logging.StreamHandler(sys.stdout)
+            stream_handler.setLevel(logging.DEBUG)
+
+            formatter = logging.Formatter(
+                "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+            )
+            file_handler.setFormatter(formatter)
+            stream_handler.setFormatter(formatter)
+
+            self.logger.addHandler(file_handler)
+            self.logger.addHandler(stream_handler)
+
+    def log(self, message):
+        self.logger.debug(message)
+
+    def error(self, message):
+        self.logger.error(message)
+
+    def info(self, message):
+        self.logger.info(message)
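
A quick sketch of the SingletonMeta behavior, assuming both calls happen in the same process: the second constructor call returns the first instance, so its argument is ignored.

    from app_log import LoggingManager

    a = LoggingManager("scraper.log")
    b = LoggingManager("other.log")  # returns the existing instance; argument ignored
    assert a is b
    a.info("written to scraper.log and echoed to stdout")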

scraper/config.py
@@ -7,8 +7,8 @@ class Config:
     PRAW_USERNAME = os.getenv("PRAW_USERNAME")
     PRAW_PASSWORD = os.getenv("PRAW_PASSWORD")
     POKEMANS_WEBHOOK_URL = os.getenv("POKEMANS_WEBHOOK_URL")
-    PKMN_ENV = 'dev' # os.getenv("PKMN_ENV")
+    PKMN_ENV = "dev"  # os.getenv("PKMN_ENV")
     SUBREDDIT_NAME = "pkmntcgdeals"
     USER_AGENT = "praw:zman.video_repost_bot:v0.1.0 (by u/jzman21)"
     DISABLE_WEBHOOK = False
     API_URL = "http://server:8000/api/"

scraper/exceptions.py: 19 lines (new file)
@@ -0,0 +1,19 @@
+class InvalidMethodError(Exception):
+    """Exception raised for unsupported HTTP methods."""
+
+    pass
+
+
+class InvalidDataTypeError(Exception):
+    """Exception raised for unsupported data types."""
+
+    pass
+
+
+class APIRequestError(Exception):
+    """Exception raised for API request errors."""
+
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(f"API Request Failed: {status_code} - {message}")
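
For illustration, how ApiRequestHandler.send_api_request surfaces these exceptions. The validation runs before any network I/O, so this sketch works offline; the URL is an assumption.

    from app import ApiRequestHandler
    from exceptions import InvalidMethodError

    api_conn = ApiRequestHandler("http://localhost:8000/api/")  # assumed URL
    try:
        api_conn.send_api_request("FETCH", api_conn.api_url + "posts/")
    except InvalidMethodError as e:
        print(e)  # Invalid method: FETCH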

scraper/main.py
@@ -1,11 +1,18 @@
-from reddit_monitor import RedditMonitor
 from webhook import WebhookNotifier
-from app import Application
+from app import (
+    Application,
+    RedditMonitor,
+    ApiRequestHandler,
+    PostManager,
+    PostAnalyticsManager,
+    SubmissionManager,
+)
 from config import Config
-import logging
+from app_log import LoggingManager
 
 
 if __name__ == "__main__":
+    log_manager = LoggingManager("scraper.log")
     client_id = Config.PRAW_CLIENT_ID
     client_secret = Config.PRAW_CLIENT_SECRET
     user_agent = Config.USER_AGENT
@@ -17,21 +24,28 @@ if __name__ == "__main__":
     pkmn_env = Config.PKMN_ENV
     api_url = Config.API_URL
 
-    # logging
-    logging.basicConfig(filename='scraper.log', level=logging.DEBUG)
-    logging.info('Starting scraper')
-
-    reddit_monitor = RedditMonitor(client_id, client_secret, user_agent, username, password, subreddit_name)
+    reddit_monitor = RedditMonitor(
+        client_id, client_secret, user_agent, username, password, subreddit_name
+    )
     webhook_notifier = WebhookNotifier(discord_webhook_url, disable_webhook)
-    app = Application(reddit_monitor, webhook_notifier, api_url)
+    api_conn = ApiRequestHandler(api_url)
+    post_manager = PostManager(api_conn)
+    post_analytics_manager = PostAnalyticsManager(api_conn, post_manager)
+    submission_manager = SubmissionManager(
+        reddit_monitor, post_manager, post_analytics_manager, webhook_notifier
+    )
+    app = Application(
+        reddit_monitor,
+        webhook_notifier,
+        api_conn,
+        post_manager,
+        post_analytics_manager,
+        submission_manager,
+    )
     app.run()
 
 """
 TODO:
-- django rest framework
-- api for managing database
-- remove scraper models
-- connect scraper to django rest framework api
 - pull upvote ratio into analytics?
 - sqlite vs postgres figure out
 - basic front end (react)
@@ -44,4 +58,4 @@ TODO:
 - try to identify platform ie. costco for gift card, tiktok for coupons, etc.
 - support for craigslist, ebay, etc.
 - front end - visualization, classification, lookup, etc.
 """

scraper/models.py
@@ -1,5 +1,17 @@
-class Submission():
-    def __init__(self, reddit_id, title, name, url, score, num_comments, created_utc, selftext, permalink, upvote_ratio):
+class Post:
+    def __init__(
+        self,
+        reddit_id,
+        title,
+        name,
+        url,
+        score,
+        num_comments,
+        created_utc,
+        selftext,
+        permalink,
+        upvote_ratio,
+    ):
         self.reddit_id = reddit_id
         self.title = title
         self.name = name
@@ -10,6 +22,6 @@ class Submission():
         self.selftext = selftext
         self.permalink = permalink
         self.upvote_ratio = upvote_ratio
 
     def __str__(self):
         return f"{self.reddit_id} {self.title} {self.name} {self.url} {self.score} {self.num_comments} {self.created_utc} {self.selftext} {self.permalink} {self.upvote_ratio}"

scraper/reddit_monitor.py (deleted; RedditMonitor now lives in scraper/app.py)
@@ -1,23 +0,0 @@
-import praw
-from datetime import datetime, timedelta
-
-
-class RedditMonitor:
-    def __init__(self, client_id, client_secret, user_agent, username, password, subreddit_name):
-        self.reddit = praw.Reddit(
-            client_id=client_id,
-            client_secret=client_secret,
-            user_agent=user_agent,
-            username=username,
-            password=password
-        )
-        self.subreddit = self.reddit.subreddit(subreddit_name)
-
-    def stream_submissions(self):
-        for submission in self.subreddit.stream.submissions():
-            yield submission
-
-    def update_submissions(self, submissions_to_update):
-        for submission in submissions_to_update:
-            praw_submission = self.reddit.submission(id=submission['reddit_id'])
-            yield praw_submission

scraper/threads.py: 26 lines (new file)
@@ -0,0 +1,26 @@
+import threading
+
+
+class Scheduler:
+    def __init__(self, interval, function):
+        self.interval = interval
+        self.function = function
+        self.stop_event = threading.Event()
+
+    def run(self):
+        while not self.stop_event.wait(self.interval):
+            self.function()
+
+    def stop(self):
+        self.stop_event.set()
+
+
+class ThreadManager:
+    def __init__(self, target, args: tuple = ()) -> None:
+        self.target = target
+        self.args = args
+
+    def run(self):
+        thread = threading.Thread(target=self.target, args=self.args)
+        thread.daemon = True
+        thread.start()
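
One timing note on Scheduler, as a sketch: stop_event.wait(interval) blocks before each call, so the first tick fires one full interval after start, and stop() ends the loop at the next wakeup.

    import time
    from threads import Scheduler, ThreadManager

    scheduler = Scheduler(interval=2, function=lambda: print("tick"))
    ThreadManager(target=scheduler.run).run()  # daemon thread; first tick after ~2s
    time.sleep(5)
    scheduler.stop()  # sets stop_event; wait() returns True and the loop exits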

scraper/webhook.py
@@ -1,21 +1,27 @@
 import requests
+from app_log import LoggingManager
 
 
 class WebhookNotifier:
     def __init__(self, webhook_url, disable_webhook=False):
         self.webhook_url = webhook_url
         self.disable_webhook = disable_webhook
+        self.log_manager = LoggingManager("scraper.log")
 
-    def send_notification(self, submission):
-        title = submission.title
-        url = submission.url
-        permalink = submission.permalink
-        selftext = submission.selftext
+    def send_notification(self, post):
+        title = post.title
+        url = post.url
+        permalink = post.permalink
+        selftext = post.selftext
         content = f"""
 **New Deal!**
 **Title:** {title}
 **URL:** {url}
 **Permalink:** https://old.reddit.com{permalink}
 **Selftext:** {selftext}"""
         if not self.disable_webhook:
-            requests.post(self.webhook_url, data={"content": content})
+            self.log_manager.log(f"Sending notification to {self.webhook_url}")
+            try:
+                requests.post(self.webhook_url, data={"content": content})
+            except Exception as e:
+                self.log_manager.error(f"Failed to send notification: {e}")
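
A dry-run sketch of the notifier with placeholder post data; with disable_webhook=True the message is formatted but nothing is sent.

    from models import Post
    from webhook import WebhookNotifier

    post = Post(
        reddit_id="abc123", title="Sample deal", name="t3_abc123",  # placeholder values
        url="https://example.com", score=1, num_comments=0, created_utc=0.0,
        selftext="", permalink="/r/pkmntcgdeals/comments/abc123/", upvote_ratio=1.0,
    )
    notifier = WebhookNotifier("https://example.invalid/webhook", disable_webhook=True)
    notifier.send_notification(post)  # builds the content string, skips the POST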

pokemans_app/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 5.0.2 on 2024-03-04 01:40
+# Generated by Django 5.0.2 on 2024-03-04 05:15
 
 import django.db.models.deletion
 from django.db import migrations, models
@@ -8,38 +8,33 @@ class Migration(migrations.Migration):
 
     initial = True
 
-    dependencies = []
+    dependencies = [
+    ]
 
     operations = [
         migrations.CreateModel(
-            name="Submission",
+            name='Post',
             fields=[
-                ("id", models.AutoField(primary_key=True, serialize=False)),
-                ("reddit_id", models.CharField(max_length=255, unique=True)),
-                ("title", models.CharField(max_length=255)),
-                ("name", models.CharField(max_length=255)),
-                ("url", models.CharField(max_length=255)),
-                ("created_utc", models.FloatField()),
-                ("selftext", models.CharField(max_length=255)),
-                ("permalink", models.CharField(max_length=255)),
-                ("upvote_ratio", models.FloatField()),
-                ("updated_at", models.DateTimeField(auto_now=True)),
+                ('id', models.AutoField(primary_key=True, serialize=False)),
+                ('reddit_id', models.CharField(max_length=255, unique=True)),
+                ('title', models.CharField(max_length=255)),
+                ('name', models.CharField(max_length=255)),
+                ('url', models.CharField(max_length=555)),
+                ('created_utc', models.FloatField()),
+                ('selftext', models.CharField(blank=True, max_length=2555, null=True)),
+                ('permalink', models.CharField(max_length=255)),
+                ('updated_at', models.DateTimeField(auto_now=True)),
             ],
         ),
         migrations.CreateModel(
-            name="SubmissionAnalytics",
+            name='PostAnalytics',
             fields=[
-                ("id", models.AutoField(primary_key=True, serialize=False)),
-                ("num_comments", models.IntegerField()),
-                ("score", models.IntegerField()),
-                ("created_at", models.DateTimeField(auto_now=True)),
-                (
-                    "submission",
-                    models.ForeignKey(
-                        on_delete=django.db.models.deletion.CASCADE,
-                        to="pokemans_app.submission",
-                    ),
-                ),
+                ('id', models.AutoField(primary_key=True, serialize=False)),
+                ('num_comments', models.IntegerField()),
+                ('score', models.IntegerField()),
+                ('upvote_ratio', models.FloatField()),
+                ('created_at', models.DateTimeField(auto_now=True)),
+                ('post', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='pokemans_app.post')),
             ],
         ),
     ]

pokemans_app/migrations/0002_… (deleted)
@@ -1,18 +0,0 @@
-# Generated by Django 5.0.2 on 2024-03-04 03:51
-
-from django.db import migrations, models
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        ("pokemans_app", "0001_initial"),
-    ]
-
-    operations = [
-        migrations.AlterField(
-            model_name="submission",
-            name="selftext",
-            field=models.CharField(blank=True, max_length=1234),
-        ),
-    ]

pokemans_app/models.py
@@ -1,22 +1,22 @@
 from django.db import models
 
 
-class Submission(models.Model):
+class Post(models.Model):
     id = models.AutoField(primary_key=True)
     reddit_id = models.CharField(max_length=255, unique=True)
     title = models.CharField(max_length=255)
     name = models.CharField(max_length=255)
-    url = models.CharField(max_length=255)
+    url = models.CharField(max_length=555)
     created_utc = models.FloatField()
-    selftext = models.CharField(max_length=1234, blank=True)
+    selftext = models.CharField(max_length=2555, blank=True, null=True)
     permalink = models.CharField(max_length=255)
-    upvote_ratio = models.FloatField()
     updated_at = models.DateTimeField(auto_now=True)
 
 
-class SubmissionAnalytics(models.Model):
+class PostAnalytics(models.Model):
     id = models.AutoField(primary_key=True)
-    submission = models.ForeignKey(Submission, on_delete=models.CASCADE)
+    post = models.ForeignKey(Post, on_delete=models.CASCADE)
     num_comments = models.IntegerField()
     score = models.IntegerField()
+    upvote_ratio = models.FloatField()
     created_at = models.DateTimeField(auto_now=True)
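
To make the renamed schema concrete: PostAnalytics rows are point-in-time snapshots of score, comment count, and upvote ratio keyed to a Post. A Django-shell sketch with a placeholder id:

    from pokemans_app.models import Post, PostAnalytics

    post = Post.objects.get(reddit_id="abc123")  # placeholder reddit id
    for snap in PostAnalytics.objects.filter(post=post).order_by("created_at"):
        print(snap.created_at, snap.score, snap.num_comments, snap.upvote_ratio)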

pokemans_app/serializers.py
@@ -1,13 +1,13 @@
 from rest_framework import serializers
-from .models import Submission, SubmissionAnalytics
+from .models import Post, PostAnalytics
 
 
-class SubmissionSerializer(serializers.ModelSerializer):
+class PostSerializer(serializers.ModelSerializer):
     class Meta:
-        model = Submission
+        model = Post
         fields = '__all__'
 
 
-class SubmissionAnalyticsSerializer(serializers.ModelSerializer):
+class PostAnalyticsSerializer(serializers.ModelSerializer):
     class Meta:
-        model = SubmissionAnalytics
+        model = PostAnalytics
         fields = '__all__'

pokemans_app/views.py
@@ -1,17 +1,18 @@
 from django.shortcuts import render
 from rest_framework import viewsets
-from .models import Submission, SubmissionAnalytics
-from .serializers import SubmissionSerializer, SubmissionAnalyticsSerializer
+from .models import Post, PostAnalytics
+from .serializers import PostSerializer, PostAnalyticsSerializer
 from datetime import timedelta
 from django.utils import timezone
+from django.utils.dateparse import parse_datetime
 
 
-class SubmissionViewSet(viewsets.ModelViewSet):
-    queryset = Submission.objects.all()
-    serializer_class = SubmissionSerializer
+class PostViewSet(viewsets.ModelViewSet):
+    queryset = Post.objects.all()
+    serializer_class = PostSerializer
 
     def get_queryset(self):
-        queryset = Submission.objects.all()
+        queryset = Post.objects.all()
         reddit_id = self.request.query_params.get('reddit_id', None)
         last_7_days = self.request.query_params.get('last_7_days', None)
 
@@ -27,6 +28,30 @@ class SubmissionViewSet(viewsets.ModelViewSet):
 
         return queryset
 
-class SubmissionAnalyticsViewSet(viewsets.ModelViewSet):
-    queryset = SubmissionAnalytics.objects.all()
-    serializer_class = SubmissionAnalyticsSerializer
+class PostAnalyticsViewSet(viewsets.ModelViewSet):
+    queryset = PostAnalytics.objects.all()
+    serializer_class = PostAnalyticsSerializer
+
+    def get_queryset(self):
+        queryset = PostAnalytics.objects.all()
+        post_id = self.request.query_params.get('post', None)
+        time_begin = self.request.query_params.get('time_begin', None)
+        time_end = self.request.query_params.get('time_end', None)
+
+        if post_id is not None:
+            queryset = queryset.filter(post=post_id)
+
+        if time_begin is not None and time_end is not None:
+            # Parse the datetime strings to timezone-aware datetime objects
+            time_begin_parsed = parse_datetime(time_begin)
+            time_end_parsed = parse_datetime(time_end)
+
+            # Ensure datetime objects are timezone-aware
+            if time_begin_parsed is not None and time_end_parsed is not None:
+                queryset = queryset.filter(created_at__gte=time_begin_parsed, created_at__lte=time_end_parsed)
+            else:
+                # Handle invalid datetime format
+                # This is where you could log an error or handle the case where datetime strings are invalid
+                pass
+
+        return queryset
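
For reference, the query shape the scraper's check_update_requirements sends to this viewset, as a standalone sketch with placeholder values:

    import requests

    params = {
        "post": 42,  # placeholder Post primary key
        "time_begin": "2024-03-04T05:00:00+00:00",
        "time_end": "2024-03-04T05:15:00+00:00",
    }
    r = requests.get("http://localhost:8000/api/post_analytics/", params=params)
    print(r.json())  # empty list: no snapshot in the window, so the scraper writes one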

urls.py
@@ -17,12 +17,12 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path, include
 from rest_framework.routers import DefaultRouter
-from pokemans_app.views import SubmissionViewSet, SubmissionAnalyticsViewSet
+from pokemans_app.views import PostViewSet, PostAnalyticsViewSet
 
 
 router = DefaultRouter()
-router.register(r"submissions", SubmissionViewSet)
-router.register(r"submission_analytics", SubmissionAnalyticsViewSet)
+router.register(r"posts", PostViewSet)
+router.register(r"post_analytics", PostAnalyticsViewSet)
 
 
 urlpatterns = [
     path("admin/", admin.site.urls),