# scrape.py (forked from bpb27/twitter_scraping)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from time import sleep
import json
import datetime
# edit these three variables
user = 'realdonaldtrump'
start = datetime.datetime(2010, 1, 1) # year, month, day
end = datetime.datetime(2016, 12, 7) # year, month, day
# only edit these if you're having problems
delay = 1 # time to wait on each page load before reading the page
driver = webdriver.Safari() # options are Chrome() Firefox() Safari()
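# if Safari isn't available, a headless Chrome setup along these lines should
# also work (a sketch, not part of the original script):
#   options = webdriver.ChromeOptions()
#   options.add_argument('--headless')
#   driver = webdriver.Chrome(options=options)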
# don't mess with this stuff
twitter_ids_filename = 'all_ids.json'
days = (end - start).days + 1
id_selector = '.time a.tweet-timestamp'
tweet_selector = 'li.js-stream-item'
user = user.lower()
ids = []
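# strategy: hit Twitter's search page one day at a time (a single day's results
# are short enough to load fully by scrolling), pull the tweet ids out of each
# day's results, then merge them with any ids saved by earlier runs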
def format_day(date):
    # format a datetime as a zero-padded YYYY-MM-DD string
    return date.strftime('%Y-%m-%d')
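# e.g. format_day(datetime.datetime(2016, 1, 5)) returns '2016-01-05'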
def form_url(since, until):
    # build a search URL scoped to one user and one day; each search term is
    # separated by %20 (a URL-encoded space)
    p1 = 'https://twitter.com/search?f=tweets&vertical=default&q=from%3A'
    p2 = user + '%20since%3A' + since + '%20until%3A' + until + '%20include%3Aretweets&src=typd'
    return p1 + p2
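# e.g. with user = 'realdonaldtrump', form_url('2016-01-01', '2016-01-02') gives
# https://twitter.com/search?f=tweets&vertical=default&q=from%3Arealdonaldtrump%20since%3A2016-01-01%20until%3A2016-01-02%20include%3Aretweets&src=typd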
def increment_day(date, i):
    return date + datetime.timedelta(days=i)
for _ in range(days):
    d1 = format_day(start)
    d2 = format_day(increment_day(start, 1))
    url = form_url(d1, d2)
    print(url)
    print(d1)
    driver.get(url)
    sleep(delay)

    try:
        found_tweets = driver.find_elements(By.CSS_SELECTOR, tweet_selector)
        increment = 10
        # keep scrolling as long as each scroll keeps surfacing new tweets
        while len(found_tweets) >= increment:
            print('scrolling down to load more tweets')
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(delay)
            found_tweets = driver.find_elements(By.CSS_SELECTOR, tweet_selector)
            increment += 10
        print('{} tweets found, {} total'.format(len(found_tweets), len(ids)))

        for tweet in found_tweets:
            try:
                # the tweet id is the last path segment of the permalink href
                tweet_id = tweet.find_element(By.CSS_SELECTOR, id_selector).get_attribute('href').split('/')[-1]
                ids.append(tweet_id)
            except StaleElementReferenceException:
                print('lost element reference', tweet)
    except NoSuchElementException:
        print('no tweets on this day')

    start = increment_day(start, 1)
# merge this run's ids with any saved by previous runs, then de-duplicate
try:
    with open(twitter_ids_filename) as f:
        all_ids = ids + json.load(f)
except FileNotFoundError:
    all_ids = ids
data_to_write = list(set(all_ids))
print('tweets found on this scrape: ', len(ids))
print('total tweet count: ', len(data_to_write))
with open(twitter_ids_filename, 'w') as outfile:
    json.dump(data_to_write, outfile)

print('all done here')
driver.close()
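# to use the output: all_ids.json holds one id per tweet; a follow-up step could
# hydrate them into full tweets, e.g. via Twitter's statuses/lookup API endpoint
# (an assumption about the intended workflow, not something this script does)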