scraper: initial Reddit deal scraper (Dockerfile, app, config, models, PRAW monitor, Discord webhook notifier)
scraper/Dockerfile (new file)

FROM python:3.11

# Don't write .pyc files; flush stdout/stderr immediately
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Set the working directory in the container
WORKDIR /app

# Install any needed packages specified in requirements.txt
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code; without this, main.py never makes it into the image
COPY . .

CMD ["python", "main.py"]
scraper/app.py (new file)

import logging
import time
from datetime import datetime

import requests

from models import Submission

# Note: logging is configured once in main.py. Calling logging.basicConfig()
# here as well would silently win the race, because basicConfig() is a no-op
# after its first call and this module is imported before main.py runs.


class Application:
    def __init__(self, reddit_monitor, webhook_notifier, api_url):
        self.reddit_monitor = reddit_monitor
        self.webhook_notifier = webhook_notifier
        self.api_url = api_url

    def send_api_request(self, method, url, data=None, params=None):
        response = requests.request(method, url, data=data, params=params)
        response.raise_for_status()  # fail loudly instead of json()-parsing an error page
        return response.json()

    def get_submission_by_reddit_id(self, reddit_id):
        logging.info(f"Getting submission by reddit_id: {reddit_id}")
        response = self.send_api_request(
            "GET", f"{self.api_url}submissions/", params={"reddit_id": reddit_id}
        )
        logging.info(response)
        return response

    def submission_exists(self, reddit_id):
        response = self.get_submission_by_reddit_id(reddit_id)
        if len(response) == 0:
            logging.info(f"Submission {reddit_id} does not exist")
            return False
        return True

    def update_submission_analytics(self, submission):
        matches = self.get_submission_by_reddit_id(submission.reddit_id)
        submission_id = matches[0]["id"]
        data = {
            "id": submission_id,
            "score": submission.score,
            "num_comments": submission.num_comments,
        }
        self.send_api_request("PATCH", f"{self.api_url}submissions/{submission_id}/", data=data)

    def get_submissions_to_update(self):
        return self.send_api_request("GET", f"{self.api_url}submissions/?last_7_days=1")

    def insert_submission(self, submission):
        data = {
            "reddit_id": submission.reddit_id,
            "title": submission.title,
            "name": submission.name,
            "url": submission.url,
            "created_utc": submission.created_utc,
            "selftext": submission.selftext,
            "permalink": submission.permalink,
            "upvote_ratio": submission.upvote_ratio,
        }
        logging.info("Inserting submission")
        response = self.send_api_request("POST", f"{self.api_url}submissions/", data=data)
        logging.info(response)

    def process_submissions(self, submissions):
        for praw_submission in submissions:
            submission = Submission(
                reddit_id=praw_submission.id,
                title=praw_submission.title,
                name=praw_submission.name,
                url=praw_submission.url,
                score=praw_submission.score,
                num_comments=praw_submission.num_comments,
                created_utc=praw_submission.created_utc,
                selftext=praw_submission.selftext,
                permalink=praw_submission.permalink,
                upvote_ratio=praw_submission.upvote_ratio,
            )
            if self.submission_exists(submission.reddit_id):
                self.update_submission_analytics(submission)
            else:
                # Score/comment counts are not part of the insert payload,
                # so a PATCH follows the POST; notify only for new posts.
                self.insert_submission(submission)
                self.update_submission_analytics(submission)
                self.webhook_notifier.send_notification(submission)

    def periodic_update(self):
        to_be_updated = self.get_submissions_to_update()
        submissions = self.reddit_monitor.update_submissions(to_be_updated)
        self.process_submissions(submissions)

    def run_periodic_update(self, interval=3600):
        while True:
            self.periodic_update()
            logging.info(f"Existing posts updated at {datetime.now()}")
            time.sleep(interval)

    def run(self):
        # Periodic re-scoring is not wired up yet; enabling it would look like
        # this (and would need `import threading` above):
        #   update_thread = threading.Thread(target=self.run_periodic_update, args=(3600,))
        #   update_thread.daemon = True
        #   update_thread.start()
        submissions = self.reddit_monitor.stream_submissions()
        self.process_submissions(submissions)
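For a feel of the upsert decision without Reddit or the API server running, Application can be exercised with stubs. A minimal sketch (StubNotifier and the "abc123" id are illustrative, not part of this commit):

from unittest.mock import patch

from app import Application

class StubNotifier:
    def send_notification(self, submission):
        print(f"notify: {submission.title}")

app = Application(reddit_monitor=None, webhook_notifier=StubNotifier(),
                  api_url="http://server:8000/api/")

# An empty list from the API means "not seen yet", so the insert path would run.
with patch.object(Application, "send_api_request", return_value=[]) as fake:
    print(app.submission_exists("abc123"))  # False
    print(fake.call_args)  # GET .../submissions/ with params={'reddit_id': 'abc123'}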
scraper/config.py (new file)

import os


class Config:
    PRAW_CLIENT_ID = os.getenv("PRAW_CLIENT_ID")
    PRAW_CLIENT_SECRET = os.getenv("PRAW_CLIENT_SECRET")
    PRAW_USERNAME = os.getenv("PRAW_USERNAME")
    PRAW_PASSWORD = os.getenv("PRAW_PASSWORD")
    POKEMANS_WEBHOOK_URL = os.getenv("POKEMANS_WEBHOOK_URL")
    PKMN_ENV = os.getenv("PKMN_ENV", "dev")  # default to dev when unset
    SUBREDDIT_NAME = "pkmntcgdeals"
    USER_AGENT = "praw:zman.video_repost_bot:v0.1.0 (by u/jzman21)"
    DISABLE_WEBHOOK = False
    API_URL = "http://server:8000/api/"
scraper/main.py (new file)

import logging

from app import Application
from config import Config
from reddit_monitor import RedditMonitor
from webhook import WebhookNotifier


if __name__ == "__main__":
    client_id = Config.PRAW_CLIENT_ID
    client_secret = Config.PRAW_CLIENT_SECRET
    user_agent = Config.USER_AGENT
    username = Config.PRAW_USERNAME
    password = Config.PRAW_PASSWORD
    subreddit_name = Config.SUBREDDIT_NAME
    discord_webhook_url = Config.POKEMANS_WEBHOOK_URL
    disable_webhook = Config.DISABLE_WEBHOOK
    pkmn_env = Config.PKMN_ENV
    api_url = Config.API_URL

    # Configure logging once, before anything logs.
    logging.basicConfig(filename="scraper.log", level=logging.DEBUG)
    logging.info("Starting scraper")

    reddit_monitor = RedditMonitor(client_id, client_secret, user_agent, username, password, subreddit_name)
    webhook_notifier = WebhookNotifier(discord_webhook_url, disable_webhook)
    app = Application(reddit_monitor, webhook_notifier, api_url)
    app.run()


"""
TODO:
- Django REST Framework API for managing the database
- remove scraper models
- connect the scraper to the Django REST Framework API
- pull upvote ratio into analytics?
- decide between SQLite and Postgres
- basic front end (React)
- tests
- logging
- filter out Canadian/UK deals
- track score and number of comments over time in the DB
- try to identify product, number of cards, price per card, etc.
- track price over time for each product
- try to identify platform, e.g. Costco for gift cards, TikTok for coupons
- support for Craigslist, eBay, etc.
- front end: visualization, classification, lookup, etc.
"""
scraper/models.py (new file)

class Submission:
    def __init__(self, reddit_id, title, name, url, score, num_comments, created_utc, selftext, permalink, upvote_ratio):
        self.reddit_id = reddit_id
        self.title = title
        self.name = name
        self.url = url
        self.score = score
        self.num_comments = num_comments
        self.created_utc = created_utc
        self.selftext = selftext
        self.permalink = permalink
        self.upvote_ratio = upvote_ratio

    def __str__(self):
        return (f"{self.reddit_id} {self.title} {self.name} {self.url} {self.score} "
                f"{self.num_comments} {self.created_utc} {self.selftext} {self.permalink} {self.upvote_ratio}")
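An aside on the design: the same model could be written more compactly as a dataclass, which generates __init__ and __repr__ automatically. A sketch of that alternative (the field types are assumptions based on what PRAW typically returns; they are not declared anywhere in this commit):

from dataclasses import dataclass


@dataclass
class Submission:
    reddit_id: str
    title: str
    name: str
    url: str
    score: int
    num_comments: int
    created_utc: float
    selftext: str
    permalink: str
    upvote_ratio: float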
scraper/reddit_monitor.py (new file)

import praw


class RedditMonitor:
    def __init__(self, client_id, client_secret, user_agent, username, password, subreddit_name):
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
            username=username,
            password=password,
        )
        self.subreddit = self.reddit.subreddit(subreddit_name)

    def stream_submissions(self):
        # Blocks indefinitely, yielding new submissions as they arrive.
        yield from self.subreddit.stream.submissions()

    def update_submissions(self, submissions_to_update):
        # Re-fetch each stored submission from Reddit for fresh scores/comment counts.
        for submission in submissions_to_update:
            yield self.reddit.submission(id=submission["reddit_id"])
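One caveat: by default, PRAW's stream.submissions() first replays up to roughly 100 recent historical posts before waiting for new ones, so every restart re-processes recent submissions (harmless here, since process_submissions upserts). If that replay is ever unwanted, the stream accepts a skip_existing flag, and the method could instead read:

    def stream_submissions(self):
        # Start from "now" rather than replaying recent history on startup.
        yield from self.subreddit.stream.submissions(skip_existing=True)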
scraper/requirements.txt (new file)

asgiref==3.7.2
certifi==2024.2.2
charset-normalizer==3.3.2
Django==5.0.2
djangorestframework==3.14.0
greenlet==3.0.3
idna==3.6
praw==7.7.1
prawcore==2.4.0
pytz==2024.1
requests==2.31.0
sqlparse==0.4.4
typing_extensions==4.10.0
update-checker==0.18.0
urllib3==2.2.1
websocket-client==1.7.0
scraper/webhook.py (new file)

import requests


class WebhookNotifier:
    def __init__(self, webhook_url, disable_webhook=False):
        self.webhook_url = webhook_url
        self.disable_webhook = disable_webhook

    def send_notification(self, submission):
        # Built from plain f-strings rather than an indented triple-quoted
        # literal, which would embed leading whitespace in the Discord message.
        content = (
            "**New Deal!**\n"
            f"**Title:** {submission.title}\n"
            f"**URL:** {submission.url}\n"
            f"**Permalink:** https://old.reddit.com{submission.permalink}\n"
            f"**Selftext:** {submission.selftext}"
        )
        if not self.disable_webhook:
            requests.post(self.webhook_url, data={"content": content}, timeout=10)
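Worth noting: Discord rejects messages whose content exceeds 2,000 characters, and a long selftext can push a deal post over that cap. A sketch of a guard that could wrap the payload (the helper name and the ellipsis marker are illustrative, not part of this commit):

DISCORD_MAX_CONTENT = 2000  # Discord's documented per-message content limit

def clip(text, limit=DISCORD_MAX_CONTENT):
    # Truncate over-long content, leaving room for an ellipsis marker.
    return text if len(text) <= limit else text[: limit - 1] + "…"

# usage inside send_notification:
#   requests.post(self.webhook_url, data={"content": clip(content)}, timeout=10)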