-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdataset_shuffler_GRBA.py
55 lines (47 loc) · 2.54 KB
/
dataset_shuffler_GRBA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/python
import datetime
import psycopg2
from time import time
from time import sleep
import sys
def main():
    """Script entry point: rebuild the shuffled tables when asked to "clean".

    Opens a connection to the ``mettas`` database and, when the script is
    invoked with exactly one argument equal to ``clean``, drops and recreates
    the shuffled tables via ``createTables`` and then exits the process.
    With any other arguments nothing is modified.

    The connection is always closed before returning or exiting.
    """
    shuffles = ['1', '2', '3']

    # Open connection. The host is a filesystem path; libpq treats a host
    # beginning with "/" as the directory holding the Unix-domain socket.
    conn = psycopg2.connect(host="/tmp/", database="mettas", user="mettas", port="1997")
    try:
        # If only one argument, "clean", is provided: drop and remake the
        # tables, then exit.
        if len(sys.argv) == 2 and sys.argv[1] == "clean":
            createTables(shuffles, conn)
            # sys.exit() rather than the site-injected exit(), which is not
            # guaranteed to exist (e.g. under ``python -S``).
            sys.exit()
    finally:
        # Runs on normal return, on SystemExit and on errors alike.
        conn.close()
def makeCopyString(scf, table):
    """Build the SQL that fills a shuffled table from its base table.

    Prints the current timestamp as the reload start time, then returns an
    ``INSERT ... SELECT`` statement that copies every row of ``table`` into
    ``table`` + ``scf`` in random order.

    Args:
        scf: Suffix appended to ``table`` to name the destination table.
        table: Name of the source (unsuffixed) table.

    Returns:
        The SQL statement as a string.
    """
    # NOTE(review): an earlier comment here referred to a COPY file path and
    # to downloading the data from https://github.com/sbharghav/1gig; this
    # function does not read any file, it copies from the base table.
    stamp = str(datetime.datetime.now())
    print("Time of data reload start: " + stamp + '\n')

    template = "INSERT INTO %s%s SELECT * FROM %s ORDER BY RANDOM();"
    source = table
    return template % (table, scf, source)
def createTables(shuffles, conn):
    """Drop, recreate and repopulate the shuffled GRBA tables.

    For every suffix in ``shuffles`` the tables ``gr_authors<suffix>`` and
    ``gr_books<suffix>`` are dropped if they exist, created empty, and then
    filled from the unsuffixed ``gr_authors`` / ``gr_books`` tables in random
    row order using the statement built by ``makeCopyString``.

    Args:
        shuffles: Iterable of string suffixes. They are concatenated into the
            SQL text unescaped, so they must come from trusted code.
        conn: Open database connection providing ``cursor()`` and
            ``commit()``. It is committed here but not closed.
    """
    cur = conn.cursor()
    try:
        for scf in shuffles:
            print("Recreating shuffle: %s" % scf)
            cur.execute("DROP TABLE IF EXISTS gr_authors" + scf + ";")
            cur.execute("DROP TABLE IF EXISTS gr_books" + scf + ";")
            cur.execute("CREATE TABLE gr_books" + scf + """ ( id VARCHAR(10),
                title TEXT,
                author_name TEXT,
                language TEXT,
                original_publication_date DATE,
                description TEXT
                );""")
            cur.execute("CREATE TABLE gr_authors" + scf + """ ( id TEXT,
                name TEXT,
                gender TEXT,
                ratings_count INTEGER,
                average_rating FLOAT,
                text_reviews_count INTEGER,
                works_count INTEGER,
                fans_count INTEGER
                );""")
            # The copy uses SELECT *, so the base tables must have the same
            # column layout as the tables created above.
            cur.execute(makeCopyString(scf, "gr_authors"))
            cur.execute(makeCopyString(scf, "gr_books"))
            # NOTE(review): committing once per shuffle is assumed; confirm
            # against the original indentation.
            conn.commit()
    finally:
        # Release the cursor even if one of the statements fails.
        cur.close()
    print('GRBA done')
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()