Merge pull request #75 from CogSciUOS/sophia
add files to preprocessing and multi-label model
Showing 4 changed files with 413 additions and 0 deletions.
@@ -0,0 +1,215 @@
from __future__ import print_function

import sys  # needed for sys.argv below

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import hamming_loss
#from sklearn.utils import class_weight
#import tensorflow as tf
#import cv2

import keras.backend as K
from keras.losses import binary_crossentropy
from keras.models import Sequential

from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout
from keras.layers import Dense

from keras.regularizers import l1, l2

#from keras.utils import plot_model
#from keras.utils import np_utils

#from keras.callbacks import EarlyStopping

from grid import *
from submit_model import *

if __name__ == '__main__':
    ################################################################################
    # Set the variables
    ################################################################################
    args = typecast(sys.argv[1:])
    path_to_data = args[0]
    path_to_labels = args[1]

    ################################################################################
    # Load the data
    ################################################################################
    label_files = pd.read_csv(path_to_labels, sep=";")
    RELEVANT_COLUMNS = ['is_hollow', 'has_blume', 'has_rost_head', 'has_rost_body', 'is_bended', 'is_violet']
    # missing labels are encoded as class 2
    labels = label_files[RELEVANT_COLUMNS].fillna(value=2)
    labels = labels.astype('int32')
    # .copy() avoids pandas' SettingWithCopyWarning when the 'label' column is added below
    labels_train = labels.iloc[:12000].copy()
    labels_test = labels.iloc[12000:]
    # create a column 'label' that holds the six labels of each row as one list
    labels_train['label'] = labels_train.values.tolist()
    print(labels_train.head())

    # desired datatype is a list of arrays, each containing the 6 labels separated by commas
    temp1 = np.array(labels_train['label'])
    train_lbl = []
    for i in range(temp1.shape[0]):
        temp2 = str(temp1[i])
        temp3 = np.fromstring(temp2[1:-1], dtype=int, sep=',')
        train_lbl.append(temp3)

    train_lbl = np.array(train_lbl)
    # temp1 = (np.array(labels_train['label']))
    # for i in range(len(temp1)):
    #     temp2 = temp1[i]

    # temp2_lbl = temp1_lbl[:, np.newaxis]
    # train_lbl = [np.fromstring(temp2_lbl[i, 1:-1], dtype=int, sep=',') for i in range(len(temp1_lbl))]
    print(" >>> train_lbl.shape = ", train_lbl.shape)
    print(" >>> train_lbl at one pos = ", train_lbl[0])

    imgs = np.load(path_to_data)
    train_img = imgs[:12000]
    test_img = imgs[12000:]
    print(" >>> train_img.shape = ", train_img.shape)

    ################################################################################
    # Build the model
    ################################################################################
    input_shape_img = (train_img.shape[1], train_img.shape[2], train_img.shape[3])
    batch_size = 32
    num_epochs = 25
    num_classes = 6
    reg = l1(0.001)

    model = Sequential()

    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape_img, kernel_regularizer=l2(0.01)))
    #model.add(Dropout(0.5))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(GlobalAveragePooling2D())
    # sigmoid (not softmax) output so that the six labels are predicted independently
    model.add(Dense(num_classes, activation='sigmoid'))

    # a customized loss function that weights wrong predictions for label 1 higher
    # than for label 0 (because of the class imbalance)
    def weighted_loss(y_true, y_pred):
        return K.mean((0.8**(1-y_true))*(1**(y_true))*K.binary_crossentropy(y_true, y_pred), axis=-1)
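    # worked weights for the loss above (derived by hand, not from the original commit);
    # note that 1**(y_true) is always 1, so the factor only varies through 0.8**(1-y_true):
    #   y_true = 1  ->  0.8**0 * 1**1 = 1.0   (errors on present labels count fully)
    #   y_true = 0  ->  0.8**1 * 1**0 = 0.8   (errors on absent labels count slightly less)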

    # note: sklearn's hamming_loss operates on numpy arrays, so this wrapper is only
    # usable for offline evaluation, not as a Keras metric on symbolic tensors
    def hamming(y_true, y_pred):
        return hamming_loss(y_true, y_pred)

    def hn_multilabel_loss(y_true, y_pred):
        # code snippet from https://groups.google.com/forum/#!topic/keras-users/_sjndHbejTY
        # avoid divide by 0
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # multi-task loss: sum the per-label binary cross-entropies, then average over the batch
        return K.mean(K.sum(- y_true * K.log(y_pred) - (1 - y_true) * K.log(1 - y_pred), axis=1))

    # confusion-matrix style metrics; the sums run over all labels in a batch and use
    # the raw sigmoid outputs as soft counts
    def FN_wrapper():
        def falseNegatives(y_true, y_pred):
            neg_y_pred = 1 - y_pred
            fn = K.sum(y_true * neg_y_pred)
            return fn
        return falseNegatives

    def FP_wrapper():
        def falsePositives(y_true, y_pred):
            neg_y_true = 1 - y_true
            fp = K.sum(neg_y_true * y_pred)
            return fp
        return falsePositives

    def TN_wrapper():
        def trueNegatives(y_true, y_pred):
            neg_y_true = 1 - y_true
            neg_y_pred = 1 - y_pred
            tn = K.sum(neg_y_true * neg_y_pred)
            return tn
        return trueNegatives

    def TP_wrapper():
        def truePositives(y_true, y_pred):
            tp = K.sum(y_true * y_pred)
            return tp
        return truePositives

    FN = FN_wrapper()
    FP = FP_wrapper()
    TN = TN_wrapper()
    TP = TP_wrapper()
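    # hypothetical derived metrics built from the soft counts above, kept as a
    # commented sketch (not part of the original script):
    # def precision(y_true, y_pred):
    #     return TP(y_true, y_pred) / (TP(y_true, y_pred) + FP(y_true, y_pred) + K.epsilon())
    # def recall(y_true, y_pred):
    #     return TP(y_true, y_pred) / (TP(y_true, y_pred) + FN(y_true, y_pred) + K.epsilon())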

    model.compile(#loss=weighted_loss,
                  loss='binary_crossentropy',
                  #loss=hn_multilabel_loss,
                  optimizer='adam',
                  metrics=['accuracy', FN, FP, TN, TP])

    model.summary()

    ################################################################################
    # Train the model
    ################################################################################
    #early_stop = EarlyStopping(monitor='loss', patience=5, verbose=1)
    #class_weights = class_weight.compute_sample_weight(class_weight="balanced", y=train_lbl)
    history = model.fit(train_img, train_lbl,
                        batch_size=batch_size,
                        epochs=num_epochs,
                        verbose=1,
                        #class_weight=class_weights, #{0:5, 1:3, 2:2, 3:2, 4:1, 5:3}
                        validation_split=0.1)
                        #callbacks=[early_stop])
    print(history.history)
    ################################################################################
    # Check the history
    ################################################################################
    plt.figure(facecolor='white')

    # accuracy ---------------------------------------------------------------------
    ax1 = plt.subplot(2, 1, 1)

    plt.plot([x * 100 for x in history.history['acc']], label="acc", color="blue")
    plt.plot([x * 100 for x in history.history['val_acc']], label="val_acc", color="red")

    plt.title('Accuracy History')
    plt.ylabel('accuracy')
    # plt.xlabel('epoch')

    plt.legend(['train', 'valid'], loc='lower right')

    # the accuracies are scaled to percent above, so the axis has to run from 0 to 100
    # (plt.ylim(0, 1) would clip the curves away)
    plt.ylim(0, 100)
    plt.xticks(np.arange(0, num_epochs + 1, 5))
    plt.yticks(np.arange(0, 100.1, 10))
    ax1.yaxis.set_major_formatter(plt.FuncFormatter('{:.0f}%'.format))
    plt.grid()

    # loss -------------------------------------------------------------------------
    plt.subplot(2, 1, 2)

    plt.plot(history.history['loss'], label="loss", color="blue")
    plt.plot(history.history['val_loss'], label="val_loss", color="red")

    plt.title('Loss History')
    plt.ylabel('loss')
    plt.xlabel('epoch')

    plt.legend(['train', 'valid'], loc='lower left')

    plt.ylim(0)
    plt.xticks(np.arange(0, num_epochs + 1, 5))
    plt.grid()
    # save the figure before calling show(): show() blocks and closes the figure,
    # so a savefig() afterwards can end up writing an empty image
    plt.savefig('/net/projects/scratch/winter/valid_until_31_July_2020/asparagus/sophia/asparagus/code/get_data/fig_l2.png')
    plt.show()
    model.save('/net/projects/scratch/winter/valid_until_31_July_2020/asparagus/sophia/asparagus/code/get_data/l2.h5')

    # convert the history.history dict to a pandas DataFrame
    hist_df = pd.DataFrame(history.history)

    # and save it to csv
    hist_csv_file = 'history_l2.csv'
    with open(hist_csv_file, mode='w') as f:
        hist_df.to_csv(f)
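The script sets aside test_img and labels_test but never evaluates on them. A minimal, hypothetical sketch of such an evaluation, reusing the names from the script above (the 0.5 decision threshold is an assumed choice; the mean disagreement mirrors the Hamming loss):

    # hypothetical evaluation on the held-out split -- not part of the committed script
    test_lbl = labels_test.values                    # held-out labels as an (n, 6) array
    pred = model.predict(test_img)                   # per-label sigmoid probabilities
    pred_bin = (pred > 0.5).astype('int32')          # assumed decision threshold of 0.5
    print("mean per-label disagreement:", np.mean(test_lbl != pred_bin))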
@@ -0,0 +1,36 @@
import numpy as np
import os
from grid import *
import sys


def combine(PATH):
    '''
    Combine the stacked npy files into one data set, downsampling each image by taking only every 6th pixel.
    Every 6th pixel turned out to be a downscale that still preserves most details;
    one could also downscale more or less, or use a different technique.
    Args: PATH to the stacked npy files
    Out: None (the combined dataset is saved to disk)
    '''
    all_files = os.listdir(PATH)
    n = len(all_files)
    # load the first image to get the dimensionality and dtype dynamically;
    # os.path.join avoids depending on a trailing slash in PATH
    first = np.load(os.path.join(PATH, all_files[0]))[::6, ::6]  # change here to downscale differently
    dtype = first.dtype
    l, w, d = first.shape
    # make some space for the dataset
    data = np.empty((n, l, w, d), dtype=dtype)
    # load all files and store them at the corresponding position in the data array
    for i, file in enumerate(all_files):
        data[i, :, :, :] = np.load(os.path.join(PATH, file))[::6, ::6]  # change here to downscale differently
        # print how far along we are
        if i % 500 == 0:
            print(i)
    # save the dataset
    path_out = os.path.join(PATH, "data_horizontal_noB.npy")
    np.save(path_out, data)


if __name__ == '__main__':
    args = typecast(sys.argv[1:])
    path = args[0]
    combine(path)
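To make the stride-6 downscaling concrete, a small shape check on a dummy array (the 1040x1376 raw size is a made-up example, not taken from the repository):

    # illustration of the [::6, ::6] slicing in combine() -- dummy data, hypothetical raw size
    import numpy as np
    img = np.zeros((1040, 1376, 3), dtype=np.uint8)
    print(img[::6, ::6].shape)   # -> (174, 230, 3): every 6th row and column is kept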
@@ -0,0 +1,63 @@
'''
This script takes all asparagus IDs from a csv file (in this case all IDs that we hand
labeled) and copies the corresponding image files to the desired location.
'''

import pandas as pd
import numpy as np
import os
from grid import *
import sys
import shutil


def get_asparagus_ids(PATH):
    '''
    Get the ids of the asparagus pieces that have been labeled so far.
    Args: PATH to the combined.csv file that contains all label files
    Out: only the ids, i.e. the first column
    '''
    # read in the file
    csvs = pd.read_csv(PATH, sep=';')
    # the column corresponding to the ids
    ids = csvs['id']
    # make it a numpy array for easier comparison later on
    ids = np.array(ids)
    return ids


def get_files(PATH):
    '''
    Get all file names in the directories and subdirectories.
    Args: PATH to the files
    Out: list of all file paths and the corresponding file names
    '''
    all_files = []
    file_names = []
    for subdir, dirs, files in os.walk(PATH):
        for file in files:
            filepath = os.path.join(subdir, file)
            if filepath.endswith(".png"):
                all_files.append(filepath)
                # strip the trailing "_a.png" (6 characters) and store the id as an int
                # for the comparison later on
                file_names.append(int(file[:-6]))
    return all_files, file_names


if __name__ == '__main__':
    args = typecast(sys.argv[1:])

    path_to_imgs = args[0]
    path_to_csv = args[1]
    path_to_save = args[2]
    # read the ids from combined.csv
    ids = get_asparagus_ids(path_to_csv)
    print('#ids: ' + str(len(ids)))
    # get all image file names and the corresponding paths
    file_paths, file_names = get_files(path_to_imgs)
    print('#files found: ' + str(len(file_names)))
    files = np.array(file_names)
    print(files[0:10])
    index_list = []
    for item in ids:
        item_index = np.where(files == item)
        for idx in item_index[0]:
            shutil.copy(file_paths[int(idx)], path_to_save)
            # to check whether all images were found, collect every matched index
            index_list.append(idx)
    print('#indices found: ' + str(len(index_list)))
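The matching loop above scans the files array once per id. A hedged sketch of a vectorized alternative with np.isin, reusing the names from the script (it copies the same set of files, though in file order rather than id order):

    # hypothetical vectorized alternative to the per-id np.where loop -- not in the commit
    matches = np.where(np.isin(files, ids))[0]   # indices of all files whose id was hand labeled
    for idx in matches:
        shutil.copy(file_paths[int(idx)], path_to_save)
    print('#indices found: ' + str(len(matches)))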