Commit 51f3e72

updated code

sophieball committed Feb 16, 2019
1 parent 57243fb commit 51f3e72

Showing 6 changed files with 165 additions and 64 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -60,7 +60,8 @@ Details about these files are in the following section.

 3. Run [`setup.py`](https://github.com/CMUSTRUDEL/oss-social-capital-icse2019/blob/master/setup.py), which reads the files `dict/alias_map_b.dict`,
    `dict/reverse_alias_map_b.dict`, and `data/uid.list`, and generates files
-   `data/pid.list`, `data/all_contributors.list`, `dict/contr_projs.dict`,
+   `data/pid.list`, `data/all_contributors.list`,
+   `data/watchers_monthly_counts_win.csv`, `dict/contr_projs.dict`,
    `data/all_projs.list`, and `dict/proj_contrs_count.dict`.

 4. Run [`get_user_info.py`](https://github.com/CMUSTRUDEL/oss-social-capital-icse2019/blob/master/get_user_info.py), [`get_proj_info.py`](https://github.com/CMUSTRUDEL/oss-social-capital-icse2019/blob/master/get_proj_info.py), and [`get_user_proj_info.py`](https://github.com/CMUSTRUDEL/oss-social-capital-icse2019/blob/master/get_user_proj_info.py). They write to `data/results_users.csv`, `data/results_proj.csv`, and `data/results_user_proj.csv` respectively.
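For orientation, `dict/contr_projs.dict` (consumed by the three scripts below) maps a user id to one list of project ids per quarterly window, 36 windows over 2008-2016, as documented in the comments being removed from `get_proj_info.py`. A minimal inspection sketch under that assumption:

import pickle

# Minimal sketch; structure per the inline documentation in get_proj_info.py:
# key = user id, value = 36 per-window lists of project ids.
f = open("dict/contr_projs.dict")
cont_projs_dict = pickle.load(f)
f.close()

projs = cont_projs_dict[1]        # user id 1, as in the documented example
print "windows:", len(projs)      # expected: 36
print "window 3:", projs[3]       # e.g. [28923L]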
61 changes: 44 additions & 17 deletions get_proj_info.py
@@ -34,15 +34,6 @@
print "Number of projects", len(pids)

# load contributors' projects
# query:
# user_projs_r = session.query(commits).filter( \
# commits.c.author_id.in_(aliases),
# commits.c.created_at >= begin,
# commits.c.created_at <= end,
# commits.c.project_id.isnot(None),
# commits.c.project_id != -1).distinct(commits.c.project_id)
# dict key: 1 (user id)
# dict value: [[], [], [], [28923L], [], [], [4121147L], [28923L], [2L, 28923L, 96621L]] (list of project ids)
f = open("dict/contr_projs.dict")
cont_projs_dict = pickle.load(f)
f.close()
@@ -151,10 +142,10 @@ def get_info(p):
         contr_lens.append(len(contr_win[i]))
         new_contr_lens.append(len(new_contr_win[i]))

-    if max(contr_lens) > 1000:
+    if max(contr_lens) > 1500:
         # otherwise it would take too long to calculate team familiarity
         big_repos.add(p)
         return []
-    #print proc_id, "contr", p, contr_lens

     # watchers
     cur_stars = watchers.loc[watchers["project_id"].isin(forks)]
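The raised cutoff keeps the expensive team-familiarity step tractable: per the comment, very large teams take too long, which is consistent with a pairwise computation over contributors (an assumption here; the familiarity code itself is not in this diff). A back-of-the-envelope sketch:

# Hedged sketch: if team familiarity touches every contributor pair,
# work per window grows quadratically with team size.
team_size = 1500
pairs = team_size * (team_size - 1) / 2
print "contributor pairs per window:", pairs   # 1124250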
@@ -166,6 +157,11 @@
         cur_win = int(cur_stars.iloc[i][["window"]])
         if cur_win > 0 and cur_win < 37:
             stars_count[cur_win-1] = int(cur_stars.iloc[i][["sum"]])
+    #print proc_id, "star", p, stars_count
+
+    # get users' projects per window to calculate team familiarity, lang
+    # diversity, and recurring cohesion
+    #contr_list_win = get_user_dict(p, helper.session)

     # create dataframe
     p_dicts = []
@@ -176,10 +172,10 @@
p_dict["p_id"] = np.int64(p)
p_dict["p_lang"] = p_lang
p_dict["p_owner"] = int(p_owner)
p_dict["p_age"] = act_win - min(act_wins)
p_dict["p_age"] = act_win - min(act_wins) # 0 based
p_dict["p_windows_active_to_date"] = win_index + 1
p_dict["p_team_size"] = contr_lens[act_win]
p_dict["p_num_users_to_date"] = sum(new_contr_lens[:act_win+1])
p_dict["p_num_users_to_date"] = sum(new_contr_lens[:act_win])
p_dict["p_num_stars"] = sum(stars_count[:act_win])
p_dict["p_num_commits"] = num_commits_win[act_win]
p_dict["p_num_commits_to_date"] = sum(num_commits_win[:act_win+1])
@@ -215,15 +211,40 @@ def get_info(p):
         #print proc_id, p, p_dict, datetime.now()
         out.write(str(p_dict)+",")
         out.write(str(datetime.now())+"\n")
     #out.write("\n".join([str(p_d) for p_d in p_dicts])+"\n")
     #results = pd.concat([results, pd.DataFrame(p_dicts)])
     #session.commit()

     return p_dicts

+'''
+for i in range(6):
+    print "pid:", i*10000, (i+1)*10000
+    p_ids_sub = pids[(i-1)*10000:i*10000]
+    pool = Pool(num_proc)
+    results = []
+    results = pool.map(get_info, p_ids_sub)
+    pool.close()
+    pool.join()
+    result_f = open("result_f"+str(i), "wb")
+    pickle.dump(results, result_f)
+    result_f.close()
+    results = [dict_item for dict_lists in results for dict_item in dict_lists]
+    results = pd.DataFrame(results)
+    results.to_csv("data/proj_results"+str(i)+".csv", index = False)
+'''


 results = []
-num_iter = len(pids) / 10000
+iter_size = 5000
+num_iter = len(pids) / iter_size
 print len(pids)
 for i in range(num_iter+1):
     pool = Pool(num_proc)
-    print datetime.now(), (i+1)*10000
-    results_i = pool.map(get_info, pids[i*10000:(i+1)*10000])
+    print datetime.now(), (i+1)*iter_size
+    results_i = pool.map(get_info, pids[i*iter_size:(i+1)*iter_size])
     results_i = [dict_item for dict_lists in results_i for dict_item in dict_lists]
     results.extend(results_i)
     pool.close()
@@ -232,5 +253,11 @@
     pickle.dump(results_i, result_f)
     result_f.close()

+'''
+results = []
+for p in pids:
+    results.append(get_info(p))
+'''

 results = pd.DataFrame(results)
-results.to_csv("data/results_proj.csv", index = False, encoding = "utf-8")
+results.to_csv("data/results_proj_big.csv", index = False, encoding = "utf-8")
47 changes: 37 additions & 10 deletions get_user_info.py
@@ -1,13 +1,10 @@
 from datetime import datetime
 from math import floor
 from multiprocessing import *
-from project_lang_div import get_lang_div
-from project_recur_co import get_recur_co
-from project_team_famil import get_team_famil
 from sqlalchemy import create_engine, MetaData, Table
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql import select
-from utils import *
+#from utils import *
 import logging
 import numpy as np
 import os
@@ -40,14 +37,16 @@
 re_alias = pickle.load(f)
 f.close()

+begin = datetime.strptime("2008-01-01", "%Y-%m-%d")
+end = datetime.strptime("2016-12-31", "%Y-%m-%d")
 langs = ["JavaScript", "Java", "Python", "CSS", "PHP", "Ruby", "C++",
          "C", "Shell", "C#", "Objective-C", "R", "VimL", "Go", "Perl",
          "CoffeeScript", "Tex", "Swift", "Scala", "Emacs Lisp", "Haskell",
          "Lua", "Clojure", "Matlab", "Arduino", "Groovy", "Puppet", "Rust",
          "PowerShell", "Erlang", "Visual Basic", "Processing", "Assembly", "Other"]
 print "Done setting up"

-url = "mysql://sophie:"+pswd+"@localhost/ghtorrent?charset=utf8mb4"
+url = "mysql://sophie:"+pswd+"@localhost/ghtorrent-2018-03?charset=utf8mb4"
 engine = create_engine(url, pool_size = num_proc, pool_recycle = 3600)
 Session = sessionmaker(bind = engine)
 metadata = MetaData(engine)
@@ -65,10 +64,26 @@
 conns_n = engine_n.connect()
 session_n = Session_n()

+gender_f = open("data/gender.csv")
+line = gender_f.readline()
+line = gender_f.readline()
+user_genders = {}
+while len(line):
+    parts = line.strip().split(",")
+    user_genders[parts[0]] = parts[-1]
+    line = gender_f.readline()
+gender_f.close()
+
+def get_merged_id(re_alias, aid):
+    if aid in re_alias:
+        return re_alias[aid]
+    return aid
+
 failed = open("u_failed", "w")
 out = open("u_out", "w")
 #for p_index, p in enumerate(pids):
 def get_info(u):
+    print u
     # if we don't have the user's project list, it means that this author has only
     # contributed to large projects which we do not include in our model.
     if u not in cont_projs_dict:
@@ -82,12 +97,21 @@ def get_info(u):
     except:
         aliases = tuple([u])

     # get user basic info
     r = session_n.query(namsor).filter(namsor.c.id == u)
     u_info = r.first()
-    u_gender = u_info.gender

+    u_email = u_info.email
+    u_login = u_info.login
+    try:
+        u_gender = user_genders[str(u)]
+        if u_gender == "1":
+            u_gender = "Female"
+        elif u_gender == "-1":
+            u_gender = "Male"
+    except:
+        u_gender = u_info.gender

     # get user ages and active ages, niche width
     u_projs = cont_projs_dict[u]
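Together, the two hunks above switch gender lookup from the `namsor` table to the precomputed `data/gender.csv` (the first `readline()` discards the header, the second primes the loop), falling back to `u_info.gender` for users missing from the file. A hedged illustration of the lookup on hypothetical rows; only the first and last columns matter to the parser:

# Hypothetical data/gender.csv rows; the parser keeps parts[0] (user id)
# and parts[-1] (gender code), both as strings.
rows = ["1,alice,1", "2,bob,-1"]
user_genders = {}
for line in rows:
    parts = line.strip().split(",")
    user_genders[parts[0]] = parts[-1]
print user_genders["1"]    # "1"  -> mapped to "Female" above
print user_genders["2"]    # "-1" -> mapped to "Male" above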
@@ -98,7 +122,9 @@ def get_info(u):
     for i in range(36):
         follower_win.append(set())

-    r = session.query(followers).filter(followers.c.user_id.in_(aliases))
+    r = session.query(followers).filter(followers.c.user_id.in_(aliases),
+                                        followers.c.created_at >= begin,
+                                        followers.c.created_at <= end)

     for rr in r.all():
         win = floor((rr.created_at.month-1)/3+1) + (rr.created_at.year-2008)*4
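The window arithmetic above maps each `created_at` onto one of 36 quarterly windows (4 per year, 2008 through 2016). A quick sanity check of the formula, runnable on its own; `to_window` is a helper name introduced only for illustration, and Python 2 integer division is load-bearing in `(month-1)/3`:

from datetime import datetime
from math import floor

def to_window(created_at):
    # same formula as in the query loop above
    return floor((created_at.month - 1) / 3 + 1) + (created_at.year - 2008) * 4

print to_window(datetime(2008, 1, 15))   # 1.0, first quarter of 2008
print to_window(datetime(2012, 7, 4))    # 19.0
print to_window(datetime(2016, 12, 31))  # 36.0, last quarter of 2016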
@@ -116,8 +142,6 @@ def get_info(u):
     for i in range(36):
         commits_win[i] = 0
     '''
-    begin = datetime.strptime("2008-01-01", "%Y-%m-%d")
-    end = datetime.strptime("2016-12-31", "%Y-%m-%d")
     r = session.query(commits).filter(commits.c.author_id.in_(aliases),
                                       commits.c.created_at >= begin,
                                       commits.c.created_at <= end)
@@ -170,6 +194,8 @@ def get_info(u):
     for lang in all_langs_list:
         u_languages.add(lang)
     u_dict["u_nichewidth"] = len(u_languages)
+    if len(u_languages) == 0:
+        u_dict["u_nichewidth"] = 1

     u_dicts.append(u_dict)
     #print proc_id, p, p_dict, datetime.now()
@@ -180,7 +206,7 @@ def get_info(u):
     #session.commit()
     conns.close()
     return u_dicts
-'''

 pool = Pool(num_proc)
 results = pool.map(get_info, uids)
 pool.close()
@@ -193,6 +219,7 @@
 results = []
 for p in uids:
     results.append(get_info(p))
+'''

 results = [dict_item for dict_lists in results for dict_item in dict_lists]
 results = pd.DataFrame(results)
15 changes: 6 additions & 9 deletions get_user_proj_info.py
@@ -1,9 +1,6 @@
 from datetime import datetime
 from math import floor
 from multiprocessing import *
-from project_lang_div import get_lang_div
-from project_recur_co import get_recur_co
-from project_team_famil import get_team_famil
 from sqlalchemy import create_engine, MetaData, Table
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.sql import select
@@ -63,7 +60,7 @@
"PowerShell", "Erlang", "Visual Basic", "Processing", "Assembly", "Other"]
print "Done setting up"

url = "mysql://sophie:"+pswd+"@localhost/ghtorrent?charset=utf8mb4"
url = "mysql://sophie:"+pswd+"@localhost/ghtorrent-2018-03?charset=utf8mb4"
engine = create_engine(url, pool_size = num_proc, pool_recycle = 3600)
Session = sessionmaker(bind = engine)
metadata = MetaData(engine)
@@ -101,16 +98,16 @@ def get_info(u_id):
     act_wins = [win for win in range(36) if len(u_projs[win]) > 0]
     for act_win in act_wins:
         for p_id in u_projs[act_win]:
-            if p_id in big_repos:
+            if p_id in big_repos or p_id == -1:
                 continue
             # no need to get root, we already stored roots
             # get the list of forks
             forks = tuple(root_forks[p_id])

             # count the number of commits made by this contributor to this project
-            [begin, end] = windows[act_win].split(" ")
-            begin = datetime.strptime(begin, "%Y-%m-%d")
-            end = datetime.strptime(end, "%Y-%m-%d")
+            [begin, end] = windows[act_win].split("_")
+            begin = datetime.strptime(begin, "%Y-%m-%d %H:%M:%S")
+            end = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")

             r = session.query(commits).filter(commits.c.author_id.in_(aliases),
                                               commits.c.project_id.in_(forks),
@@ -185,7 +182,7 @@ def get_info(u_id):
u_p_dict["u_pr_merge"] = 1

u_p_dicts.append(u_p_dict)
#print proc_id, p, p_dict, datetime.now()
#print proc_id, u_id, u_p_dict, datetime.now()
#results = pd.concat([results, pd.DataFrame(p_dicts)])
#session.commit()
conns.close()
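The reworked parsing above implies that entries of `windows` changed from space-separated dates to underscore-separated timestamps. A hedged example of the new format; the exact boundary values are assumptions:

from datetime import datetime

# Hypothetical windows[act_win] entry in the new format: two timestamps
# joined by "_", each parsed with "%Y-%m-%d %H:%M:%S".
window = "2008-01-01 00:00:00_2008-03-31 23:59:59"
[begin, end] = window.split("_")
begin = datetime.strptime(begin, "%Y-%m-%d %H:%M:%S")
end = datetime.strptime(end, "%Y-%m-%d %H:%M:%S")
print begin, end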