I am a moderator of /r/sweepstakes on Reddit, which lets users post their referral links to contests/sweepstakes. One main rule is that a user is not allowed to post their link to a contest if another user has already done so. Checking for reposts is not so simple, since all referral links have different URLs (e.g. contest.com/?ref=Kevin and contest.com/?ref=Steve).
I thought a good way to find a repost is to retrieve the title of the webpage (the <title>
tag) and store it in a database along with some other vital information.
It scans the subreddit every 15m for new posts. For every post it does the following:
See if we have already looked at the post by searching the DB for the pid (post id). If we have it, skip and move on to the next post.
Get the final URL using urllib, since some URLs redirect to another webpage (e.g. bit.ly links).
Get the title (the <title> tag) of the webpage using BeautifulSoup.
Search the DB for the title. If the title is already in the database, then the submitted post is a repost, and we retrieve some information about the original post (permalink, submitter). We add this information to a string that will be sent to the moderators.
If the submitted post's title does not already exist in the database, then it is a unique post and we add it to the database.
Once all posts have been processed, send the message of all reposts to the moderators for them to manually inspect.
I ran into a lot of issues and they predominantly had to do with finding the final URL of the post and finding the page's title. To keep things simple, I may end up removing the function to find a URL's final URL, since it isn't very important.
I ran into ASCII/Unicode issues and I kept getting UnicodeEncodeError/UnicodeDecodeError
exceptions.
Suggestions on how to improve the code would be appreciated.
import traceback
import praw # simple interface to the reddit API, also handles rate limiting of requests
import time
import sqlite3
import re
from urlparse import urlparse
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import requests
'''USER CONFIGURATION'''
# OAuth credentials for the registered reddit application.
APP_ID = 'XXXX'
APP_SECRET = 'XXXX'
# Redirect URI registered with the reddit application.
APP_URI = 'XXXX'
# Long-lived refresh token used to obtain fresh access tokens.
APP_REFRESH = 'XXXX'
# Descriptive user-agent string, as required by the reddit API rules.
USERAGENT = 'XXXX'
# Subreddit to scan (name only, without the /r/ prefix).
SUBREDDIT = "XXXX"
# Maximum number of newest posts to examine per cycle.
MAXPOSTS = 30
WAIT = 900 #15m This is how many seconds you will wait between cycles. The bot is completely inactive during this time.
# Resolve redirects for a URL. i.e. bit.ly/XXXX --> somesite.com/blahblah
# Also input # of retries in case rate-limit
def resolve_redirects(url, tries):
    """Open `url`, following redirects, and return the response object.

    Retries up to `tries` times on HTTP 403/429 (rate limiting),
    sleeping 5 seconds between attempts.  Returns None when the URL
    cannot be opened at all.
    """
    tries -= 1
    try:
        # Spoof a browser user agent: some sites block the default
        # python/urllib2 user agent outright.
        req = urllib2.Request(url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36"})
        return urllib2.urlopen(req)
    except urllib2.HTTPError as e:
        # BUG FIX: the messages below used the module-global `domain`,
        # which may be unset here (NameError); report the URL we tried.
        print('HTTPError: ' + str(e.code) + ': ' + url)
        if e.code in (403, 429) and tries > 0:
            time.sleep(5)
            # BUG FIX: the retry's result was previously discarded
            # (missing `return`), so even a successful retry yielded None.
            return resolve_redirects(url, tries)
    except urllib2.URLError as e:
        print('URLError: ' + str(e.reason) + ': ' + url)
    except Exception:
        # `traceback` is already imported at module level; the local
        # re-import was redundant and has been removed.
        print('Generic Exception: ' + traceback.format_exc())
# Get title of webpage if possible. Otherwise just set the page title equal to the pages URL
def get_title(url):
    """Return the page <title> as a UTF-8 byte string.

    `url` is a file-like response object as returned by
    urllib2.urlopen() / resolve_redirects().  Falls back to the
    response's effective URL when the page has no usable <title>.
    """
    try:
        # .title is None for pages without a <title>, and .string is None
        # for an empty tag -- both raise AttributeError, caught below.
        title = BeautifulSoup(url).title.string.strip()
    except AttributeError:
        title = url.geturl()
    # BUG FIX: encode only unicode objects.  Calling .encode('utf-8') on
    # a Python 2 byte string (e.g. the URL fallback) performs an implicit
    # ASCII decode first and raises UnicodeDecodeError on non-ASCII data.
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    return title.strip()
# Load Database
sql = sqlite3.connect('Reddit_DB.db')
print('Loaded SQL Database')
cur = sql.cursor()
# Create Table and Login to Reddit
# One row per unique post seen: reddit id, permalink, link domain,
# final (post-redirect) URL, page <title>, and submitting user.
cur.execute('CREATE TABLE IF NOT EXISTS duplicates(id TEXT, permalink TEXT, domain TEXT, url TEXT, title TEXT, submitter TEXT)')
sql.commit()
print('Logging in...')
# Authenticate to reddit via OAuth with the configured app credentials.
r = praw.Reddit(USERAGENT)
r.set_oauth_app_info(APP_ID, APP_SECRET, APP_URI)
r.refresh_access_information(APP_REFRESH)
# Main portion of code
def replybot():
print('Searching %s @ %s' % (SUBREDDIT, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
subreddit = r.get_subreddit(SUBREDDIT)
submissions = list(subreddit.get_new(limit=MAXPOSTS))
msg = ""
for post in submissions:
global domain # Need to be global to use in resolve_redirects()
pid = post.id
try:
author = post.author.name
except AttributeError:
print "AttributeError: Author is deleted"
continue
# See if we have already looked at this post before. If we have, skip it.
cur.execute('SELECT * FROM duplicates WHERE ID=?', [pid])
sql.commit()
if cur.fetchone(): # Post is already in the database
continue
url = post.url
domain = post.domain
if domain == "self." + str(SUBREDDIT): # Skip self posts
continue
# Get the final url after redirects (i.e. in case URL redirects to a different URL)
try:
post_url = resolve_redirects(url, 3)
effective_url = post_url.geturl()
except AttributeError:
print "AttributeError: Post URL/Effective URL"
continue
# Get Title of webpage in Final URL
try:
post_title = get_title(post_url).encode('utf-8').strip()
except UnicodeDecodeError:
post_title = unicode(get_title(post_url).strip(),"utf-8")
except UnicodeEncodeError:
print "UnicodeError: " + post.title
continue
# Check if the post is a repost by seeing if the Title already exists. If it does, get the Repost's permalink, title, submitter and create the message. Otherwise post is unique and is added to DB
cur.execute('SELECT * FROM duplicates where TITLE=?', [post_title])
sql.commit()
row = cur.fetchone()
if row:
repost_permalink = row[1]
repost_title = row[4]
repost_submitter = row[5]
print "Found repost of %s by %s" % (post.title, author)
msg += 'Repost: [%s](%s) by /u/%s. Original: [Here](%s) by /u/%s.\n\n' % (post.title, post.permalink, author, repost_permalink, repost_submitter)
else:
cur.execute('INSERT INTO duplicates VALUES(?,?,?,?,?,?)', [pid, post.permalink, domain, effective_url, post_title, author])
sql.commit()
# If message exists (meaning there was a repost), send message to moderators
if len(msg) > 0:
r.send_message('/r/sweepstakes', 'Possible Repost', msg)
print "Sent message"
else:
print "Nothing to send"
cycles = 0
# Run forever: one scan of the subreddit every WAIT seconds.
while True:
    try:
        # Keep refresh alive by refreshing every 45m
        # (3 cycles x 15m; reddit access tokens expire after 60m).
        if cycles % 3 == 0:
            r.refresh_access_information(APP_REFRESH)
            print "Refreshed OAuth"
        replybot()
        cycles += 1
    except Exception as e:
        # Log the error and keep running; the next cycle may succeed.
        traceback.print_exc()
    time.sleep(WAIT)