-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreddit_scraper.py
125 lines (94 loc) · 2.91 KB
/
reddit_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created by @JamesBower on 20 Dec 2017
twitter.com/jamesbower
Inspired from: cheesinglee @ https://gist.github.com/cheesinglee
"""
import praw
import time
from time import gmtime
from datetime import datetime
import sys
import json

# Python 2 only: force a UTF-8 default encoding so implicit str<->unicode
# conversions on scraped text don't raise UnicodeDecodeError. `reload` is
# not a builtin on Python 3 (where UTF-8 is already the default), so guard
# the call to keep the script importable there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- Python 2 builtin
    sys.setdefaultencoding('utf-8')

############ Append JSON output to daily file ############
# One output file per UTC day, e.g. reddit_2017-12-20.json; opened in
# append mode so repeated runs on the same day accumulate records.
filename = 'reddit_' + time.strftime('%Y-%m-%d', time.gmtime()) + '.json'
f = open(filename, "a+")
##########################################################

# Reddit API credentials -- fill these in before running.
AUTH_PARAMS = {
    'client_id': 'Left Blank',
    'client_secret': 'Left Blank',
    'password': 'Left Blank',
    'username': 'Left Blank',
    'user_agent': 'Reddit:Explorers:0.1 (by /u/blibblob)'}
# Submission attributes copied verbatim into each output record.
POST_KEYS = ['name', 'url', 'title', 'created_utc', 'score', 'subreddit', 'domain', 'is_self', 'selftext_html', 'downs', 'ups']
# Subreddits to scrape -- fill in before running.
SUBREDDITS = ['Put Subreddit Here']
# When True, each record also carries author metadata (see get_author_info).
SCRAPE_AUTHORS = True
# Memoization cache for author lookups, keyed by Redditor id.
processed_users = {}


def get_author_info(a):
    """Return a small dict describing a Reddit author.

    Results are memoized in the module-level ``processed_users`` cache keyed
    by author id, so each author is processed at most once per run.

    Parameters:
        a: praw Redditor instance, or None/falsy for deleted accounts.

    Returns:
        dict with 'author_name' and 'author_created_sec_utc' keys.
    """
    if not a:
        # Deleted/suspended accounts come through as a falsy author.
        return {'author_name': '',
                'author_created_sec_utc': None}
    if a.id in processed_users:
        return processed_users[a.id]
    # Bug fix: the original computed gmtime(a.created_utc) but discarded it,
    # so real authors were missing 'author_created_sec_utc' while the
    # anonymous branch returned it. Store the raw epoch seconds so both
    # branches yield the same schema.
    d = {'author_name': a.name,
         'author_created_sec_utc': a.created_utc}
    processed_users[a.id] = d
    return d
def process_post(post):
    """Serialize one praw Submission as a JSON line in the daily output file.

    Copies the fields listed in POST_KEYS (lower-casing string values),
    records thumbnail/preview info, flattens the comment tree into a list of
    comment bodies, optionally attaches author metadata, and appends the
    record to the module-level file ``f`` as one JSON object per line.

    Parameters:
        post: praw Submission instance.

    Returns:
        None -- output goes to the file, not the caller.
    """
    d = {}
    postdict = vars(post)
    for key in POST_KEYS:
        val = postdict[key]
        # Lower-case string fields; non-strings (ints, bools, None) have no
        # .lower(), so only AttributeError is expected -- don't swallow more.
        try:
            val = val.lower()
        except AttributeError:
            pass
        d[key] = val
    # 'default' and 'self' are placeholder thumbnails, not real images.
    d['has_thumbnail'] = post.thumbnail not in ('default', 'self')
    if d['has_thumbnail']:
        # Some submissions carry a thumbnail but no preview payload; guard
        # so one odd post doesn't abort the whole scrape.
        preview = getattr(post, 'preview', None)
        if preview:
            d['image_url'] = preview['images'][0]['source']['url']
    ############# Comment scraping ##########################
    # Expand at most one "MoreComments" stub, then flatten the tree.
    post.comments.replace_more(limit=1)
    comments = post.comments.list()
    # .list() already returns a list -- no need to re-materialize it.
    d['n_comments'] = len(comments)
    d['comments'] = [c.body for c in comments]
    #########################################################
    ############## Author scraping ##########################
    if SCRAPE_AUTHORS:
        author_dict = get_author_info(post.author)
        for key, val in author_dict.items():
            d[key] = val
        # NOTE(review): 'subreddit' is a praw object, so it is dropped before
        # serialization -- but only on this path in the original; presumably
        # it should be dropped unconditionally. Kept as-is to preserve
        # behavior; confirm against a run with SCRAPE_AUTHORS = False.
        del d['subreddit']
    #########################################################
    ############## Output JSON to file ######################
    f.write(json.dumps(d) + '\n')
    #########################################################
if __name__ == '__main__':
    # Authenticate against the Reddit API with the credentials above.
    r = praw.Reddit(**AUTH_PARAMS)
    # Bug fix: the original stored process_post's return value (always None)
    # in a dict keyed by post id; only the ids matter for de-duplication, so
    # track them in a set.
    seen_ids = set()
    if SUBREDDITS:
        for subreddit in SUBREDDITS:
            sub = r.subreddit(subreddit)
            for post in sub.new(limit=1):
                if post.id not in seen_ids:
                    seen_ids.add(post.id)
                    process_post(post)
        # Close the daily output file once all subreddits are scraped so
        # buffered records are flushed to disk.
        f.close()
    else:
        print('Choose a subreddit ...')
        sys.exit(0)