-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathfilter_dataset.py
38 lines (27 loc) · 1.22 KB
/
filter_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import sys
import csv
import os
import json
from tqdm import tqdm
from utils import *
# Directory that the filtering loop searches for "<friend id>.json" files.
users_location = "../dataset_tweego/users"

# NOTE(review): `dump_location` is not defined in this file; presumably it is
# provided by `from utils import *` -- confirm.
# The user map is read inside a context manager so the file handle is closed
# as soon as parsing finishes instead of whenever the object is collected.
with open("{}/user_map_all.json".format(dump_location)) as user_map_file:
    user_dict = json.load(user_map_file)

# Candidate ids are the keys of the user map (an earlier version of this
# script read them line by line from "<dump_location>/all.txt").
friends = user_dict.keys()
# Build the filtered map. A friend is kept only when all of these hold:
#   * its "total_count" in the user map is greater than 2,
#   * a "<friend>.json" file exists under `users_location`,
#   * that file reports followers_count >= 5000 and friends_count <= 5000.
filtered_friends = {}
for friend in tqdm(friends):
    total_count = user_dict[friend]["total_count"]
    if total_count <= 2:
        continue

    profile_path = "{}/{}.json".format(users_location, str(friend))
    if not os.path.exists(profile_path):
        # No stored file for this friend: skip it.
        continue

    # Close each file right after parsing; this loop may open a very large
    # number of files and must not leak one handle per iteration.
    with open(profile_path, "r") as profile_file:
        user = json.load(profile_file)

    if user["followers_count"] >= 5000 and user["friends_count"] <= 5000:
        filtered_friends[friend] = {
            "followers_count": user["followers_count"],
            "total_count": total_count,
        }

# Report how many friends survived the filter.
print(len(filtered_friends))
# NOTE(review): `dump_location` is not defined in this file; presumably it is
# provided by `from utils import *` -- confirm.
# Write the kept ids, one per line, with no trailing newline.
with open('{}/all_10k.txt'.format(dump_location), 'w', encoding='utf-8') as f:
    f.write('\n'.join(str(x) for x in filtered_friends))

# Dump the filtered map as indented JSON. The context manager guarantees the
# file is flushed and closed rather than relying on garbage collection.
with open("{}/user_map_all_filtered.json".format(dump_location), "w") as filtered_map_file:
    json.dump(filtered_friends, filtered_map_file, indent=4)