build_dataset.py
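"""Build a data matrix from a spreadsheet.

Reads the columns listed in --columns_to_read from a .csv/.xls/.xlsx file
(by default data/2017CHR_CSV_Analytic_Data.csv) and returns them as A, a
list of lists with one inner list per requested column. When --is_b_vec is
"False", only rows whose third column parses as an integer are kept;
otherwise every row is used.
"""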
import pandas as pd
import numpy as np
import random
import os
import argparse
import sys
import ast
import math
import pdb


# Check if a string represents an int
def repsInt(test_str):
    try:
        int(test_str)
        return True
    except ValueError:
        return False

def parse_args():
    # Create the parser and add arguments specifying where the data file is and what to read from it
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('-f', '--filename', default='data/2017CHR_CSV_Analytic_Data.csv', help="Dataset spreadsheet", required=False)
    parser.add_argument('-c', '--columns_to_read', default='[6, 21]', help="Which columns to read from the spreadsheet", required=False)
    parser.add_argument('-b', '--is_b_vec', default='True', help="Are we trying to find the b vector", required=False)
    return parser

def main(args):
    # Check that the dataset is where we expect it to be
    assert os.path.isfile(args.filename), "Couldn't find the dataset at {}".format(args.filename)
    filename = args.filename
    all_columns = ast.literal_eval(args.columns_to_read)

    # Read data from the spreadsheet
    if filename.endswith('.xls') or filename.endswith('.xlsx'):
        df = pd.read_excel(filename)
    elif filename.endswith('.csv'):
        df = pd.read_csv(filename)
    else:
        raise ValueError("Unsupported file type: {}".format(filename))

    # Convert from pandas to a numpy array
    full_array = df.values
    array_size = full_array.shape

    A = []
    # Index of the first row we keep; used below to decide whether we are still
    # creating the rows of A or appending to rows that already exist
    first_data_ix = 100000

    if args.is_b_vec == "False":
        # Loop over rows of the full array
        for i in range(0, array_size[0]):
            if repsInt(full_array[i, 2]):
                # If the 3rd column (county code) is 0, we are looking at a number for a
                # state. This is what we want. Is this the case in other datasets?
                # if int(full_array[i, 2]) == 0:
                # if math.isnan(float(full_array[i, 2])):
                for col_num in range(len(all_columns)):
                    col = all_columns[col_num]
                    # Check if the data is a string. If not, write directly to temp (assumes a float)
                    if type(full_array[i, col]) is str:
                        temp_str = full_array[i, col]
                        # The data has commas. Remove them to cast to float
                        try:
                            temp = float(temp_str.replace(',', ''))
                        except AttributeError:
                            print('Trying to use replace on temp_str when it is not a str')
                            pdb.set_trace()
                    else:
                        temp = float(full_array[i, col])
                    # Start a new row of A if this is the first kept spreadsheet row;
                    # otherwise append to the existing rows
                    if not A or (i == first_data_ix):
                        A.append([temp])
                        first_data_ix = i
                    else:
                        A[col_num].append(temp)
    else:
        for i in range(0, array_size[0]):
            for col_num in range(len(all_columns)):
                col = all_columns[col_num]
                # Check if the data is a string. If not, write directly to temp (assumes a float)
                if type(full_array[i, col]) is str:
                    temp_str = full_array[i, col]
                    # The data has commas. Remove them to cast to float
                    try:
                        temp = float(temp_str.replace(',', ''))
                    except AttributeError:
                        print('Trying to use replace on temp_str when it is not a str')
                        pdb.set_trace()
                else:
                    temp = float(full_array[i, col])
                # Start a new row of A if this is the first spreadsheet row;
                # otherwise append to the existing rows
                if not A or (i == first_data_ix):
                    A.append([temp])
                    first_data_ix = i
                else:
                    A[col_num].append(temp)
    return A

if __name__ == '__main__':
    parser = parse_args()
    args = parser.parse_args()
    main(args)
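
# Example invocation, using the defaults defined in parse_args():
#   python build_dataset.py -f data/2017CHR_CSV_Analytic_Data.csv -c '[6, 21]' -b False
# main() returns A: one inner list of floats per requested column index.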