# dataset_shuffler_ABR.py
# (Repository web-viewer residue -- navigation text and a column of line
# numbers 1-61 -- stood here; it was not part of the script.)
# !/bin/python
import datetime
import psycopg2
from time import time
from time import sleep
import sys
def main():
    """Connect to the database and, if asked to, rebuild the shuffled tables.

    When the script is invoked with exactly one argument equal to ``clean``
    the shuffled tables for suffixes ``1``, ``2`` and ``3`` are dropped,
    recreated and repopulated via ``createTables``, and the process exits.
    With any other arguments the function only opens and closes the
    connection.

    Raises:
        SystemExit: after a successful ``clean`` run (raised by ``sys.exit``).
    """
    shuffles = ['1', '2', '3']
    # Open connection. A host beginning with "/" is interpreted by libpq as
    # the directory holding the server's Unix-domain socket.
    conn = psycopg2.connect(host="/tmp/", database="mettas", user="mettas", port="1997")
    try:
        # If only one argument, clean is provided, drop and remake tables, then exit
        if len(sys.argv) == 2 and sys.argv[1] == "clean":
            createTables(shuffles, conn)
            # sys.exit() instead of the interactive-only exit() helper, which
            # is injected by the site module and may be absent (python -S).
            sys.exit()
    finally:
        # Runs on normal return, on SystemExit and on errors alike, so the
        # connection is never leaked.
        conn.close()
def makeCopyString(scf, table):
    """Build the SQL statement that fills a shuffled table from its base table.

    As a side effect, a timestamp line is printed on every call.

    Args:
        scf: Shuffle suffix; appended to ``table`` to name the destination.
        table: Name of the base table that rows are read from.

    Returns:
        An ``INSERT ... SELECT`` statement that copies all rows of ``table``
        into ``table`` + ``scf``, selected with ``ORDER BY RANDOM()``.
    """
    started_at = str(datetime.datetime.now())
    print("Time of data reload start: " + started_at + '\n')
    # Legacy note: an earlier comment here referred to a COPY file path that
    # is not present in this function. It named
    # https://github.com/sbharghav/1gig as the place to download the data.
    name_parts = (table, scf, table)
    return "INSERT INTO %s%s SELECT * FROM %s ORDER BY RANDOM();" % name_parts
def createTables(shuffles, conn):
    """Drop, recreate and repopulate the shuffled copies of both tables.

    For every suffix in ``shuffles`` the tables ``books_data<suffix>`` and
    ``reviews<suffix>`` are dropped if they exist, created again with the
    schemas below, and filled from the unsuffixed ``books_data`` and
    ``reviews`` tables using the statement built by ``makeCopyString``.
    The work for each suffix is committed before the next one starts.

    Args:
        shuffles: Iterable of string suffixes (they are concatenated onto the
            table names, so non-string values raise ``TypeError``).
        conn: Open database connection providing ``cursor()`` and
            ``commit()`` (a psycopg2 connection in this script).
    """
    cur = conn.cursor()
    try:
        for scf in shuffles:
            print("Recreating shuffle: %s" % scf)
            cur.execute("DROP TABLE IF EXISTS books_data" + scf + ";")
            cur.execute("DROP TABLE IF EXISTS reviews" + scf + ";")
            cur.execute("CREATE TABLE books_data" + scf + """ ( id INT,
                title TEXT,
                description TEXT,
                authors TEXT,
                image_url TEXT,
                preview_link TEXT,
                publisher TEXT,
                published_date DATE,
                info_link TEXT,
                categories TEXT
                );""")
            cur.execute("CREATE TABLE reviews" + scf + """ ( id VARCHAR(13),
                book_id TEXT,
                title TEXT,
                price DECIMAL(15, 2),
                user_id VARCHAR(255),
                profile_name TEXT,
                helpfulness TEXT,
                score DECIMAL(2, 1),
                review_time TIMESTAMP,
                summary TEXT,
                text TEXT
                );""")
            cur.execute(makeCopyString(scf, "books_data"))
            cur.execute(makeCopyString(scf, "reviews"))
            # NOTE(review): the source reached review without indentation;
            # committing once per shuffle (rather than once after the loop)
            # is assumed -- confirm against the original file.
            conn.commit()
    finally:
        # Release the cursor even if one of the statements fails.
        cur.close()
    # NOTE(review): assumed to be printed once, after all shuffles -- confirm.
    print('ABR done')
# Run the shuffler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()