broken bad code lole

2024-03-03 23:01:01 -05:00
parent 91d55efd20
commit c16bbb5275
29 changed files with 538 additions and 154 deletions

scraper/Dockerfile (new file, 14 lines)

@@ -0,0 +1,14 @@
FROM python:3.11

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Set the working directory in the container
WORKDIR /app

# Install any needed packages specified in requirements.txt
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source; without this (or a bind mount supplied at
# runtime, e.g. by docker-compose) main.py does not exist in the image
COPY . .

CMD ["python", "main.py"]

scraper/app.py (new file, 103 lines)

@@ -0,0 +1,103 @@
import threading
import time
from datetime import datetime

import requests

from models import Submission
import logging

logging.basicConfig(level=logging.INFO)


class Application:
    def __init__(self, reddit_monitor, webhook_notifier, api_url):
        self.reddit_monitor = reddit_monitor
        self.webhook_notifier = webhook_notifier
        self.api_url = api_url

    def send_api_request(self, method, url, data=None, params=None):
        response = requests.request(method, url, data=data, params=params)
        return response.json()

    def get_submission_by_reddit_id(self, reddit_id):
        logging.info(f"Getting submission by reddit_id: {reddit_id}")
        response = self.send_api_request(
            "GET", f"{self.api_url}submissions/?reddit_id={reddit_id}"
        )
        logging.info(response)
        return response

    def submission_exists(self, reddit_id):
        response = self.get_submission_by_reddit_id(reddit_id)
        if len(response) == 0:
            logging.info(f"Submission {reddit_id} does not exist")
            return False
        return True

    def update_submission_analytics(self, submission):
        # Resolve the API's primary key for this reddit_id, then PATCH only
        # the fields that change over time.
        matches = self.get_submission_by_reddit_id(submission.reddit_id)
        logging.info(matches)
        submission_id = matches[0]["id"]
        data = {
            "id": submission_id,
            "score": submission.score,
            "num_comments": submission.num_comments,
        }
        self.send_api_request("PATCH", f"{self.api_url}submissions/{submission_id}/", data=data)

    def get_submissions_to_update(self):
        return self.send_api_request("GET", f"{self.api_url}submissions/?last_7_days=1")

    def insert_submission(self, submission):
        data = {
            "reddit_id": submission.reddit_id,
            "title": submission.title,
            "name": submission.name,
            "url": submission.url,
            "created_utc": submission.created_utc,
            "selftext": submission.selftext,
            "permalink": submission.permalink,
            "upvote_ratio": submission.upvote_ratio,
        }
        logging.info("Inserting submission")
        response = self.send_api_request("POST", f"{self.api_url}submissions/", data=data)
        logging.info(response)

    def process_submissions(self, submissions):
        for praw_submission in submissions:
            submission = Submission(
                reddit_id=praw_submission.id,
                title=praw_submission.title,
                name=praw_submission.name,
                url=praw_submission.url,
                score=praw_submission.score,
                num_comments=praw_submission.num_comments,
                created_utc=praw_submission.created_utc,
                selftext=praw_submission.selftext,
                permalink=praw_submission.permalink,
                upvote_ratio=praw_submission.upvote_ratio
            )
            if self.submission_exists(submission.reddit_id):
                self.update_submission_analytics(submission)
            else:
                # The insert payload carries no score/num_comments, so a
                # follow-up PATCH stores the initial analytics.
                self.insert_submission(submission)
                self.update_submission_analytics(submission)
                self.webhook_notifier.send_notification(submission)

    def periodic_update(self):
        to_be_updated = self.get_submissions_to_update()
        submissions = self.reddit_monitor.update_submissions(to_be_updated)
        self.process_submissions(submissions)

    def run_periodic_update(self, interval=3600):
        while True:
            self.periodic_update()
            logging.info(f"Existing posts updated at {datetime.now()}")
            time.sleep(interval)

    def run(self):
        # update_frequency = 3600
        # update_thread = threading.Thread(target=self.run_periodic_update, args=(update_frequency,))
        # update_thread.daemon = True
        # update_thread.start()
        submissions = self.reddit_monitor.stream_submissions()
        self.process_submissions(submissions)
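
A quick way to sanity-check the exists-vs-insert branch above without Reddit or the REST API is to swap send_api_request for an in-memory fake. Everything here (fake_send, store, the SimpleNamespace post) is a hypothetical test harness, not part of the commit; it only assumes the files in this commit are on the import path.

from types import SimpleNamespace
from unittest.mock import MagicMock

from app import Application

app = Application(reddit_monitor=MagicMock(), webhook_notifier=MagicMock(),
                  api_url="http://server:8000/api/")

store = {}  # stands in for the API's submissions table

def fake_send(method, url, data=None, params=None):
    if method == "POST":           # insert_submission
        store[data["reddit_id"]] = {"id": 1, **data}
        return store[data["reddit_id"]]
    if method == "PATCH":          # update_submission_analytics
        next(iter(store.values())).update(data)
        return data
    return list(store.values())    # GET lookups

app.send_api_request = fake_send

post = SimpleNamespace(id="abc123", title="Deal", name="t3_abc123",
                       url="https://example.com/x", score=5, num_comments=2,
                       created_utc=0.0, selftext="", permalink="/r/pkmntcgdeals/abc123",
                       upvote_ratio=0.99)

app.process_submissions([post])    # first sight: POST, then PATCH, then webhook
assert store["abc123"]["score"] == 5
app.webhook_notifier.send_notification.assert_called_once()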

scraper/config.py (new file, 14 lines)

@@ -0,0 +1,14 @@
import os


class Config:
    PRAW_CLIENT_ID = os.getenv("PRAW_CLIENT_ID")
    PRAW_CLIENT_SECRET = os.getenv("PRAW_CLIENT_SECRET")
    PRAW_USERNAME = os.getenv("PRAW_USERNAME")
    PRAW_PASSWORD = os.getenv("PRAW_PASSWORD")
    POKEMANS_WEBHOOK_URL = os.getenv("POKEMANS_WEBHOOK_URL")
    PKMN_ENV = 'dev'  # os.getenv("PKMN_ENV")
    SUBREDDIT_NAME = "pkmntcgdeals"
    USER_AGENT = "praw:zman.video_repost_bot:v0.1.0 (by u/jzman21)"
    DISABLE_WEBHOOK = False
    API_URL = "http://server:8000/api/"
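
Each credential above silently falls back to None when its variable is unset, and the failure only surfaces later inside praw.Reddit. A small fail-fast guard one could run at startup; require_env is a hypothetical helper, not project code:

import os

REQUIRED = ["PRAW_CLIENT_ID", "PRAW_CLIENT_SECRET", "PRAW_USERNAME",
            "PRAW_PASSWORD", "POKEMANS_WEBHOOK_URL"]

def require_env(names):
    # Report every missing variable at once instead of failing one at a time.
    missing = [n for n in names if not os.getenv(n)]
    if missing:
        raise RuntimeError("Missing environment variables: " + ", ".join(missing))

require_env(REQUIRED)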

scraper/main.py (new file, 47 lines)

@@ -0,0 +1,47 @@
from reddit_monitor import RedditMonitor
from webhook import WebhookNotifier
from app import Application
from config import Config
import logging

if __name__ == "__main__":
    client_id = Config.PRAW_CLIENT_ID
    client_secret = Config.PRAW_CLIENT_SECRET
    user_agent = Config.USER_AGENT
    username = Config.PRAW_USERNAME
    password = Config.PRAW_PASSWORD
    subreddit_name = Config.SUBREDDIT_NAME
    discord_webhook_url = Config.POKEMANS_WEBHOOK_URL
    disable_webhook = Config.DISABLE_WEBHOOK
    pkmn_env = Config.PKMN_ENV
    api_url = Config.API_URL

    logging.basicConfig(filename='scraper.log', level=logging.DEBUG)
    logging.info('Starting scraper')

    reddit_monitor = RedditMonitor(client_id, client_secret, user_agent, username, password, subreddit_name)
    webhook_notifier = WebhookNotifier(discord_webhook_url, disable_webhook)
    app = Application(reddit_monitor, webhook_notifier, api_url)
    app.run()

"""
TODO:
- django rest framework
- api for managing database
- remove scraper models
- connect scraper to django rest framework api
- pull upvote ratio into analytics?
- sqlite vs postgres: decide which to use
- basic front end (react)
- tests
- logging
- filter out Canadian/UK deals
- track score and number of comments over time in db
- try to identify product, number of cards, price per card, etc.
- track price over time for each product
- try to identify platform, e.g. Costco for gift cards, TikTok for coupons, etc.
- support for Craigslist, eBay, etc.
- front end: visualization, classification, lookup, etc.
"""

scraper/models.py (new file, 15 lines)

@@ -0,0 +1,15 @@
class Submission:
    def __init__(self, reddit_id, title, name, url, score, num_comments,
                 created_utc, selftext, permalink, upvote_ratio):
        self.reddit_id = reddit_id
        self.title = title
        self.name = name
        self.url = url
        self.score = score
        self.num_comments = num_comments
        self.created_utc = created_utc
        self.selftext = selftext
        self.permalink = permalink
        self.upvote_ratio = upvote_ratio

    def __str__(self):
        return (f"{self.reddit_id} {self.title} {self.name} {self.url} "
                f"{self.score} {self.num_comments} {self.created_utc} "
                f"{self.selftext} {self.permalink} {self.upvote_ratio}")
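
Since this class is pure data, the standard-library dataclass decorator would generate the __init__ and a readable __repr__ automatically. An equivalent sketch, not what the commit ships; the field types are assumptions inferred from the PRAW attributes used in app.py:

from dataclasses import dataclass

@dataclass
class Submission:
    reddit_id: str
    title: str
    name: str
    url: str
    score: int
    num_comments: int
    created_utc: float
    selftext: str
    permalink: str
    upvote_ratio: float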

scraper/reddit_monitor.py (new file, 23 lines)

@@ -0,0 +1,23 @@
import praw


class RedditMonitor:
    def __init__(self, client_id, client_secret, user_agent, username, password, subreddit_name):
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            username=username,
            password=password
        )
        self.subreddit = self.reddit.subreddit(subreddit_name)

    def stream_submissions(self):
        # Blocking generator: yields each new post on the subreddit as it arrives.
        for submission in self.subreddit.stream.submissions():
            yield submission

    def update_submissions(self, submissions_to_update):
        # Re-fetch each stored submission from Reddit for fresh score/comment counts.
        for submission in submissions_to_update:
            praw_submission = self.reddit.submission(id=submission['reddit_id'])
            yield praw_submission
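
One PRAW detail worth knowing here: stream.submissions() first replays up to roughly 100 recent posts before yielding new ones, so every restart re-processes the backlog (harmless, since submission_exists deduplicates, but it costs API calls). PRAW streams accept skip_existing=True to start from "now"; a minimal sketch of that variant, written as a free function for illustration:

def stream_new_submissions(subreddit):
    # Yields only posts created after the stream starts; no backlog replay.
    for submission in subreddit.stream.submissions(skip_existing=True):
        yield submission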

scraper/requirements.txt (new file, 16 lines)

@@ -0,0 +1,16 @@
asgiref==3.7.2
certifi==2024.2.2
charset-normalizer==3.3.2
Django==5.0.2
djangorestframework==3.14.0
greenlet==3.0.3
idna==3.6
praw==7.7.1
prawcore==2.4.0
pytz==2024.1
requests==2.31.0
sqlparse==0.4.4
typing_extensions==4.10.0
update-checker==0.18.0
urllib3==2.2.1
websocket-client==1.7.0

scraper/webhook.py (new file, 21 lines)

@@ -0,0 +1,21 @@
import requests


class WebhookNotifier:
    def __init__(self, webhook_url, disable_webhook=False):
        self.webhook_url = webhook_url
        self.disable_webhook = disable_webhook

    def send_notification(self, submission):
        title = submission.title
        url = submission.url
        permalink = submission.permalink
        selftext = submission.selftext
        content = f"""
**New Deal!**
**Title:** {title}
**URL:** {url}
**Permalink:** https://old.reddit.com{permalink}
**Selftext:** {selftext}"""
        if not self.disable_webhook:
            requests.post(self.webhook_url, data={"content": content})
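
Two failure modes this notifier does not handle yet: Discord rejects message content over 2000 characters (a long selftext will get a 400), and the requests.post result is discarded, so errors pass silently. A sketch of a truncating, error-surfacing wrapper; the names here are illustrative, not project code:

import requests

DISCORD_CONTENT_LIMIT = 2000  # Discord's hard cap on message content length

def post_clamped(webhook_url, content, limit=DISCORD_CONTENT_LIMIT):
    # Trim overlong content so Discord does not reject the message.
    if len(content) > limit:
        content = content[: limit - 3] + "..."
    response = requests.post(webhook_url, data={"content": content})
    response.raise_for_status()  # surface 4xx/5xx instead of failing silently
    return response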