# 148_imbalanced_data_DeepLearning.py
# https://youtu.be/vOBbKNwi6Go
"""
@author: Sreenivas Bhattiprolu
Random Forest performs well on imbalanced datasets
because its hierarchical structure allows it to learn signals from all classes.
This script tackles the same kind of imbalanced segmentation problem with a
deep learning model, using class weights during training to compensate.
"""
import numpy as np
import cv2
import pandas as pd
img = cv2.imread('images/Sandstone_Versa0180_image.tif')
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#Save original image pixels into a data frame. This is our Feature #1.
img2 = img.reshape(-1)
df = pd.DataFrame()
df['Original Image'] = img2
#Generate Gabor features
num = 1  #Counter used to label Gabor feature columns in the data frame
kernels = []
for theta in range(2):   #Define number of thetas
    theta = theta / 4. * np.pi
    for sigma in (1, 3):  #Sigma with 1 and 3
        for lamda in np.arange(0, np.pi, np.pi / 4):   #Range of wavelengths
            for gamma in (0.05, 0.5):   #Gamma values of 0.05 and 0.5
                gabor_label = 'Gabor' + str(num)  #Label Gabor columns as Gabor1, Gabor2, etc.
                # print(gabor_label)
                ksize = 9
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)
                kernels.append(kernel)
                #Now filter the image and add values to a new column
                fimg = cv2.filter2D(img2, cv2.CV_8UC3, kernel)
                filtered_img = fimg.reshape(-1)
                df[gabor_label] = filtered_img  #Labels columns as Gabor1, Gabor2, etc.
                print(gabor_label, ': theta=', theta, ': sigma=', sigma, ': lamda=', lamda, ': gamma=', gamma)
                num += 1  #Increment for gabor column label
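#Optional sketch (not part of the original tutorial): visually inspect one of
#the collected Gabor kernels to sanity-check the filter bank.
#Assumes matplotlib is installed; uncomment to use.
#from matplotlib import pyplot as plt
#plt.imshow(kernels[0], cmap='gray')   #Any index from 0 to len(kernels)-1
#plt.title('Example Gabor kernel')
#plt.show()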
#Now, add a column in the data frame for the Labels
#For this, we need to import the labeled image
labeled_img = cv2.imread('images/Sandstone_Versa0180_mask.png')
#Remember that you can load an image with partial labels
#But, drop the rows with unlabeled data
labeled_img = cv2.cvtColor(labeled_img, cv2.COLOR_BGR2GRAY)
labeled_img1 = labeled_img.reshape(-1)
df['Labels'] = labeled_img1
print(df.head())
print(df.Labels.unique()) #Look at the labels in our dataframe
print(df['Labels'].value_counts())
#Let us drop background pixels. In this example they have a value 33
# Normally you may want to also drop unlabeled pixels with value 0
#df = df[df.Labels != 33]
#print(df['Labels'].value_counts())
#Either way, label 231 is over-represented and label 65 is under-represented
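#Minimal sketch (added for illustration): quantify the imbalance as percentages
#so the weighting choices below are easier to reason about.
print(df['Labels'].value_counts(normalize=True) * 100)  #Percent of pixels per label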
#########################################################
#Define the dependent variable that needs to be predicted (labels)
Y = df["Labels"].values
#Define the independent variables
X = df.drop(labels = ["Labels"], axis=1)
X = X.values  #Convert X from pandas dataframe to Numpy array (as_matrix() was removed in recent pandas)
#Split data into train and test to verify accuracy after fitting the model.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)
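#Optional alternative (not in the original): a stratified split keeps the class
#proportions the same in train and test, which is often preferable for
#imbalanced data. Uncomment to use instead of the split above.
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4,
#                                                    random_state=42, stratify=Y)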
#################################################################
############################################################################
#For deep learning....
# create model
from keras.models import Sequential
from keras.layers import Dense
model = Sequential()
model.add(Dense(8, input_dim=33, activation='relu'))  #33 features: 1 original pixel column + 32 Gabor responses
#model.add(Dense(8, activation='relu'))
model.add(Dense(4, activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()
from keras.utils import normalize, to_categorical
from sklearn.preprocessing import LabelEncoder
X_train_deep = normalize(X_train, axis=1)
X_test_deep = normalize(X_test, axis=1)
#Encode Y labels to 0, 1, 2, 3, then one hot encode.
le = LabelEncoder()
le.fit(y_train)
y_train_deep = le.transform(y_train)
y_train_deep_cat = to_categorical(y_train_deep)
y_test_deep = le.transform(y_test)
y_test_deep_cat = to_categorical(y_test_deep)
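#Illustrative check (added): confirm how LabelEncoder mapped the original pixel
#values to 0..3, and that the mapping can be reversed for reporting.
print("Encoded classes:", le.classes_)                 #Original pixel values in encoded order
print("Back-transformed:", le.inverse_transform([0, 1, 2, 3]))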
#Automatically define class weights
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight(class_weight='balanced',  #Can also give a dictionary
                                                  classes=np.unique(y_train_deep),
                                                  y=y_train_deep)
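#Sketch (added): compute_class_weight returns a NumPy array ordered by class
#index, while Keras expects a dictionary, so a conversion like this would be
#needed before passing it to model.fit(..., class_weight=...).
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)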
#Manually define class weights
#Try all equal weights first, then adjust the weights to bring each class to
#roughly the same effective contribution (about 30k samples each)
#NOTE: After encoding: 33=0, 65=1 (under-represented), 201=2, 231=3 (over-represented)
#print(df['Labels'].value_counts())
values, counts = np.unique(y_train_deep, return_counts=True)
print(values, counts)
class_weights_manual = {0: 0.1,
1: 1.6,
2: 0.108,
3: 0.015}
#class_weights_manual = {0: 0.1,
# 1: 0.1,
# 2: 0.1,
# 3: 0.1}
history = model.fit(X_train_deep, y_train_deep_cat, epochs=10,
class_weight=class_weights_manual, validation_data = (X_test_deep, y_test_deep_cat))
from matplotlib import pyplot as plt
#plot the training and validation accuracy and loss at each epoch
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'y', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
acc = history.history['accuracy']          #Key is 'acc' on older Keras versions
val_acc = history.history['val_accuracy']  #Key is 'val_acc' on older Keras versions
plt.plot(epochs, acc, 'y', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
prediction_deep = model.predict(X_test_deep)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_deep, y_test_deep_cat, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))
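#Added sketch: overall accuracy is misleading on imbalanced data, so also look
#at per-class metrics. Uses argmax of the softmax outputs as the predicted class.
from sklearn.metrics import balanced_accuracy_score, classification_report
y_pred_classes = np.argmax(prediction_deep, axis=-1)
print("Balanced accuracy:", balanced_accuracy_score(y_test_deep, y_pred_classes))
print(classification_report(y_test_deep, y_pred_classes))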
mythreshold = 0.5
y_pred_thresh = (prediction_deep >= mythreshold).astype(int)
#Note: with softmax outputs at most one class can exceed 0.5; rows where no
#class clears the threshold fall back to class 0 under argmax.
y_pred_thresholded = np.argmax(y_pred_thresh, axis=-1)
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test_deep, y_pred_thresholded)
print(cm)
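#Optional visualization (added, assumes seaborn is installed): plot the
#confusion matrix as a heatmap. Uncomment to use.
#import seaborn as sns
#sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#plt.xlabel('Predicted class')
#plt.ylabel('True class')
#plt.show()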
print("Pixel 33 accuracy = ", cm[0,0] / (cm[0,0]+cm[1,0]+cm[2,0]+cm[3,0]))
print("Pixel 65 accuracy = ", cm[1,1] / (cm[0,1]+cm[1,1]+cm[2,1]+cm[3,1]))
print("Pixel 201 accuracy = ", cm[2,2] / (cm[0,2]+cm[1,2]+cm[2,2]+cm[3,2]))
print("Pixel 231 accuracy = ", cm[3,3] / (cm[0,3]+cm[1,3]+cm[2,3]+cm[3,3]))