-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtwitter_username.py
113 lines (101 loc) · 4.13 KB
/
twitter_username.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: utf-8 -*-
"""
@author: apurv
"""
import csv
import json
import os
import time

import numpy as np
import pandas as pd
import tweepy #https://github.com/tweepy/tweepy
#Twitter API credentials
#Read from the environment so real secrets never have to be written into this
#file; each value falls back to the original empty-string placeholder when the
#corresponding variable is not set.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")
#module-level accumulator: the __main__ script appends every downloaded tweet record here
l=[]
def get_all_tweets(screen_name):
    """Download a user's timeline and flatten every tweet into one dict.

    Authenticates with the module-level credentials, then pages backwards
    through ``api.user_timeline`` (200 tweets per request) until a request
    comes back empty.

    Args:
        screen_name: Twitter handle whose timeline should be downloaded.

    Returns:
        A list with one dict per tweet. Each dict holds the tweet-level
        fields named in ``extd`` plus the author fields named in ``userd``;
        the author's ``id`` and ``created_at`` are stored as ``user_id`` and
        ``user_created_at`` so they do not clash with the tweet's own
        ``id`` / ``created_at``. An account with no tweets yields ``[]``.
    """
    #Twitter only allows access to a users most recent 3240 tweets with this method
    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    #initialize a list to hold all the tweepy Tweets
    alltweets = []
    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)

    #keep grabbing tweets until there are no tweets left to grab; the loop is
    #skipped entirely when the first page is empty, so alltweets[-1] is only
    #evaluated when at least one tweet exists
    while new_tweets:
        #ask only for tweets older than the oldest one fetched so far
        oldest = alltweets[-1].id - 1
        print("getting tweets before %s" % (oldest,))
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)
        print("...%s tweets downloaded so far" % (len(alltweets),))

    #author-level and tweet-level fields kept in the flattened record
    userd = {'screen_name', 'description', 'favourites_count', 'followers_count',
             'friends_count', 'id', 'location', 'name', 'listed_count',
             'statuses_count', 'created_at'}
    extd = {'id', 'retweet_count', 'text', 'retweeted', 'lang', 'favorited',
            'in_reply_to_screen_name', 'in_reply_to_status_id', 'created_at',
            'in_reply_to_user_id'}

    lst = []
    for tweet in alltweets:
        payload = tweet._json
        record = {key: value for key, value in payload['user'].items() if key in userd}
        #rename before merging so the tweet's own id/created_at are not shadowed
        record['user_id'] = record.pop('id')
        record['user_created_at'] = record.pop('created_at')
        record.update((key, value) for key, value in payload.items() if key in extd)
        lst.append(record)
    return lst
#get_all_tweets returns one flat dict per tweet; the script below loads them into a DataFrame that populates the csv
def build_user_metrics(df, keywords):
    """Summarise scraped tweets into one metrics dict per screen name.

    Args:
        df: DataFrame with one row per tweet and at least the columns
            ``screen_name``, ``name``, ``text``, ``retweeted``,
            ``statuses_count``, ``retweet_count``, ``favourites_count``,
            ``followers_count`` and ``friends_count``.
        keywords: strings joined with ``|`` into the pattern used to pick
            the "sample" tweets (``str.contains`` treats it as a regex).

    Returns:
        A list of dicts, one per distinct ``screen_name`` in order of first
        appearance. ``'avg rt/t_sample'`` is the string ``'inf'`` when no
        keyword tweet has ``retweeted == 0``.
    """
    pattern = '|'.join(keywords)
    metrics = []
    for screen_name in df['screen_name'].unique().tolist():
        user_df = df[df['screen_name'] == screen_name]
        #na=False: rows whose text is not a string never count as keyword hits
        keyword_df = user_df[user_df['text'].str.contains(pattern, na=False)]

        #float denominator forces true division, so ratios are not truncated
        #to integers when the script runs under Python 2
        total = np.float64(user_df['statuses_count'].max())

        #NOTE(review): these counts rely on the API's `retweeted` flag marking
        #a tweet as a retweet - confirm that is what the flag means here
        rt_flag = user_df['retweeted'].astype(int)
        kw_rt_flag = keyword_df['retweeted'].astype(int)
        kw_originals = int((kw_rt_flag == 0).sum())
        kw_retweets = int((kw_rt_flag == 1).sum())

        d_met = {}
        #NOTE: the 'sreen_name' spelling is the original output column name and is kept as is
        d_met['sreen_name'] = screen_name
        d_met['User_name'] = user_df['name'].unique().tolist()
        d_met['Total_tweets'] = user_df['statuses_count'].max()
        d_met['Tweets_scrapped'] = len(user_df.index)
        d_met['Max_num_rt'] = user_df['retweet_count'].max()
        d_met['Max_fav_ct'] = user_df['favourites_count'].max()
        d_met['Follower_count'] = user_df['followers_count'].max()
        d_met['friends_count'] = user_df['friends_count'].max()
        d_met['sample_tweets'] = len(keyword_df.index)
        d_met['Per_sample_tweets'] = len(keyword_df.index) / total
        d_met['Perc_orig'] = int((rt_flag == 0).sum()) * 100 / total
        d_met['Perc_rt'] = int((rt_flag == 1).sum()) * 100 / total
        if kw_originals == 0:
            d_met['avg rt/t_sample'] = 'inf'
        else:
            d_met['avg rt/t_sample'] = kw_retweets * 100.0 / kw_originals
        metrics.append(d_met)
    return metrics


if __name__ == '__main__':
    #pass in the usernames of the accounts you want to download, split into batches
    u1=[]
    u2=[]
    u3=[]
    ulst=[u1,u2,u3]
    for batch_no, batch in enumerate(ulst):
        for screen_name in batch:
            print(screen_name)
            try:
                l.extend(get_all_tweets(screen_name))
            except Exception as err:
                #best-effort scrape: report the failing account and move on;
                #KeyboardInterrupt/SystemExit still stop the script
                print("skipping %s: %s" % (screen_name, err))
                continue
            #pause between accounts after a successful download
            time.sleep(60)
        #long pause between batches only; skipped for empty batches and after the last one
        if batch and batch_no < len(ulst) - 1:
            time.sleep(1000)

    if not l:
        #no rows means no 'screen_name' column, so there is nothing to summarise
        print("no tweets downloaded; nothing to write")
    else:
        df=pd.DataFrame(l)
        df.to_csv('outak1_raw.csv', sep=',', encoding='utf-8') #Output FIle
        #blank out missing values so the text matching below only sees strings
        df1 = df.fillna(' ')
        keywd = ['sample','test'] #keywords
        met_lst = build_user_metrics(df1, keywd)
        df_met = pd.DataFrame(met_lst)
        #write the metrics frame (the original wrote the raw frame here by mistake)
        df_met.to_csv('met_oak1.csv', sep=',', encoding='utf-8')#metric File