
Merge pull request #75 from CogSciUOS/sophia
add files to preprocessing and multi-label model
rich-pel authored Mar 26, 2020
2 parents fdda536 + faabfde commit afd76b8
Showing 4 changed files with 413 additions and 0 deletions.
215 changes: 215 additions & 0 deletions code/labelCNN/multilabel_model.py
@@ -0,0 +1,215 @@
from __future__ import print_function

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import hamming_loss
#from sklearn.utils import class_weight
#import tensorflow as tf
#import cv2

import keras.backend as K
from keras.losses import binary_crossentropy
from keras.models import Sequential

from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout
from keras.layers import Dense

from keras.regularizers import l1, l2

#from keras.utils import plot_model
#from keras.utils import np_utils

#from keras.callbacks import EarlyStopping

import sys  # needed for sys.argv below

from grid import *
from submit_model import *


if __name__ == '__main__':
    ################################################################################
    # Set the variables
    ################################################################################
    args = typecast(sys.argv[1:])
    path_to_data = args[0]
    path_to_labels = args[1]

    ################################################################################
    # Load the data
    ################################################################################
    label_files = pd.read_csv(path_to_labels, sep=";")
    RELEVANT_COLUMNS = ['is_hollow', 'has_blume', 'has_rost_head', 'has_rost_body', 'is_bended', 'is_violet']
    labels = label_files[RELEVANT_COLUMNS].fillna(value=2)
    labels = labels.astype('int32')
    # copy to avoid a SettingWithCopyWarning when adding the 'label' column below
    labels_train = labels.iloc[:12000].copy()
    labels_test = labels.iloc[12000:]
    # create a column 'label' that holds the six label values of each row as a list
    labels_train['label'] = labels_train.values.tolist()
    print(labels_train.head())

    # desired datatype is a list of arrays, each containing the 6 labels separated by commas
    temp1 = np.array(labels_train['label'])
    train_lbl = []
    for i in range(temp1.shape[0]):
        temp2 = str(temp1[i])
        temp3 = np.fromstring(temp2[1:-1], dtype=int, sep=',')
        train_lbl.append(temp3)

    train_lbl = np.array(train_lbl)
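    # Editor's sketch: since 'label' was built from the six relevant columns, an
    # equivalent and simpler conversion (assuming an integer-typed frame) would be
    #   train_lbl = labels_train[RELEVANT_COLUMNS].to_numpy(dtype=int)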
    # temp1 = (np.array(labels_train['label']))
    # for i in range(len(temp1)):
    #     temp2 = temp1[i]

    # temp2_lbl = temp1_lbl[:, np.newaxis]
    # train_lbl = [np.fromstring(temp2_lbl[i, 1:-1], dtype=int, sep=',') for i in range(len(temp1_lbl))]
    print(" >>> train_lbl.shape = ", train_lbl.shape)
    print(" >>> train_lbl at one pos = ", train_lbl[0])

    imgs = np.load(path_to_data)
    train_img = imgs[:12000]
    test_img = imgs[12000:]
    print(" >>> train_img.shape = ", train_img.shape)

    ################################################################################
    # Build the model
    ################################################################################
    input_shape_img = (train_img.shape[1], train_img.shape[2], train_img.shape[3])
    batch_size = 32
    num_epochs = 25
    num_classes = 6
    reg = l1(0.001)

    model = Sequential()

    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape_img, kernel_regularizer=l2(0.01)))
    #model.add(Dropout(0.5))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.01)))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(GlobalAveragePooling2D())
    model.add(Dense(num_classes, activation='sigmoid'))

    # a custom loss function that weights errors on true label 1 higher than on 0 (because of class imbalance)
    def weighted_loss(y_true, y_pred):
        return K.mean((0.8**(1 - y_true)) * (1**(y_true)) * K.binary_crossentropy(y_true, y_pred), axis=-1)
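    # Note: 0.8**(1 - y_true) evaluates to 0.8 where y_true == 0 and to 1.0 where
    # y_true == 1, so misclassified positives are penalized more than misclassified
    # negatives; the factor 1**y_true is always 1 and has no effect.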

    def hamming(y_true, y_pred):
        return hamming_loss(y_true, y_pred)
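    # Note: sklearn's hamming_loss operates on numpy arrays and would fail on
    # symbolic Keras tensors; this helper is defined but not passed to compile().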

    def hn_multilabel_loss(y_true, y_pred):
        # code snippet from https://groups.google.com/forum/#!topic/keras-users/_sjndHbejTY
        # avoid divide by 0
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # multi-task loss
        return K.mean(K.sum(-y_true * K.log(y_pred) - (1 - y_true) * K.log(1 - y_pred), axis=1))

    def FN_wrapper():
        def falseNegatives(y_true, y_pred):
            neg_y_pred = 1 - y_pred
            fn = K.sum(y_true * neg_y_pred)
            return fn
        return falseNegatives

    def FP_wrapper():
        def falsePositives(y_true, y_pred):
            neg_y_true = 1 - y_true
            fp = K.sum(neg_y_true * y_pred)
            return fp
        return falsePositives

    def TN_wrapper():
        def trueNegatives(y_true, y_pred):
            neg_y_true = 1 - y_true
            neg_y_pred = 1 - y_pred
            tn = K.sum(neg_y_true * neg_y_pred)
            return tn
        return trueNegatives

    def TP_wrapper():
        def truePositives(y_true, y_pred):
            tp = K.sum(y_true * y_pred)
            return tp
        return truePositives

    FN = FN_wrapper()
    FP = FP_wrapper()
    TN = TN_wrapper()
    TP = TP_wrapper()

    model.compile(#loss=weighted_loss,
                  loss='binary_crossentropy',
                  #loss=hn_multilabel_loss,
                  optimizer='adam',
                  metrics=['accuracy', FN, FP, TN, TP])

    model.summary()

    ################################################################################
    # Train the model
    ################################################################################
    #early_stop = EarlyStopping(monitor='loss', patience=5, verbose=1)
    #class_weights = class_weight.compute_sample_weight(class_weight="balanced", y=train_lbl)
    history = model.fit(train_img, train_lbl,
                        batch_size=batch_size,
                        epochs=num_epochs,
                        verbose=1,
                        #class_weight=class_weights,  # e.g. {0: 5, 1: 3, 2: 2, 3: 2, 4: 1, 5: 3}
                        validation_split=0.1)
                        #callbacks=[early_stop])
    print(history.history)
    ################################################################################
    # Check the history
    ################################################################################
    plt.figure(facecolor='white')

    # accuracy ---------------------------------------------------------------------
    ax1 = plt.subplot(2, 1, 1)

    plt.plot([x * 100 for x in history.history['acc']], label="acc", color="blue")
    plt.plot([x * 100 for x in history.history['val_acc']], label="val_acc", color="red")

    plt.title('Accuracy History')
    plt.ylabel('accuracy')
    # plt.xlabel('epoch')

    plt.legend(['train', 'valid'], loc='lower right')

    plt.ylim(0, 100)  # values were scaled to percent above
    plt.xticks(np.arange(0, num_epochs + 1, 5))
    plt.yticks(np.arange(0, 100.1, 10))
    ax1.yaxis.set_major_formatter(plt.FuncFormatter('{:.0f}%'.format))
    plt.grid()

    # loss -------------------------------------------------------------------------
    plt.subplot(2, 1, 2)

    plt.plot(history.history['loss'], label="loss", color="blue")
    plt.plot(history.history['val_loss'], label="val_loss", color="red")

    plt.title('Loss History')
    plt.ylabel('loss')
    plt.xlabel('epoch')

    plt.legend(['train', 'valid'], loc='lower left')

    plt.ylim(0)
    plt.xticks(np.arange(0, num_epochs + 1, 5))
    plt.grid()
    # save before show: after show() the figure is flushed, and saving would write a blank image
    plt.savefig('/net/projects/scratch/winter/valid_until_31_July_2020/asparagus/sophia/asparagus/code/get_data/fig_l2.png')
    plt.show()
    model.save('/net/projects/scratch/winter/valid_until_31_July_2020/asparagus/sophia/asparagus/code/get_data/l2.h5')

    # convert the history.history dict to a pandas DataFrame
    hist_df = pd.DataFrame(history.history)

    # and save to csv
    hist_csv_file = 'history_l2.csv'
    with open(hist_csv_file, mode='w') as f:
        hist_df.to_csv(f)
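For reference, a hedged invocation sketch (the paths are hypothetical; `typecast` from the project's `grid` module is assumed to forward the two CLI arguments):

    python code/labelCNN/multilabel_model.py \
        path/to/data_horizontal_noB.npy \
        path/to/combined.csv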
36 changes: 36 additions & 0 deletions code/preprocess/combine_npy_files.py
@@ -0,0 +1,36 @@
import numpy as np
import os
import sys

from grid import *

def combine(PATH):
    '''
    Combine the stacked npy files into one data set and downsample them by only taking every 6th pixel.
    We used every 6th pixel because it is a downscale that still preserves most details.
    One could also downscale more or less, or with a different technique.
    Args: path to the stacked npy files (with a trailing slash, as it is concatenated with the file names)
    Out: None (just saves the dataset)
    '''
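    # Downsampling example: arr[::6, ::6] keeps every 6th row and column, so an
    # image array of shape (H, W, C) becomes (ceil(H/6), ceil(W/6), C).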
    all_files = os.listdir(PATH)
    n = len(all_files)
    # load the first image to get dimensionality and dtype dynamically
    first = np.load(PATH + all_files[0])[::6, ::6]  # change here to downscale differently
    dtype = first.dtype
    l, w, d = first.shape
    # make some space for the dataset
    data = np.empty((n, l, w, d), dtype=dtype)
    # load all files and save them in the corresponding position in the data array
    for i, file in enumerate(all_files):
        data[i, :, :, :] = np.load(PATH + file)[::6, ::6]  # change here to downscale differently
        # print how far along we are
        if i % 500 == 0:
            print(i)
    # save dataset
    path_out = PATH + "data_horizontal_noB.npy"
    np.save(path_out, data)

if __name__ == '__main__':
    args = typecast(sys.argv[1:])
    path = args[0]
    combine(path)
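A hedged usage sketch (the directory name is hypothetical; note the trailing slash, since `combine` concatenates `PATH` with the file names directly):

    python code/preprocess/combine_npy_files.py path/to/stacked_npy_files/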
63 changes: 63 additions & 0 deletions code/preprocess/move_files.py
@@ -0,0 +1,63 @@
'''
This script takes all asparagus IDs from a csv file (in this case, all IDs that we hand labeled) and copies the corresponding image files to the desired location.
'''

import pandas as pd
import numpy as np
import os
import sys
import shutil

from grid import *

def get_asparagus_ids(PATH):
    '''
    Get the ids of the asparagus pieces that have been labeled so far.
    Args: path to the combined.csv file that contains all label files
    Out: the ids (the first column) as a numpy array
    '''
    # read in the file
    csvs = pd.read_csv(PATH, sep=';')
    # the column corresponding to the ids
    ids = csvs['id']
    # make it a numpy array for easier comparison
    ids = np.array(ids)
    return ids

def get_files(PATH):
    '''
    Get all file names in directories and subdirectories.
    Args: PATH to the files
    Out: list of all file paths and the corresponding file names as ints
    '''
    all_files = []
    file_names = []
    for subdir, dirs, files in os.walk(PATH):
        for file in files:
            filepath = subdir + '/' + file
            if filepath.endswith(".png"):
                all_files.append(filepath)
                # strip the trailing suffix (e.g. '_a.png') and cast to int for later comparison
                file_names.append(int(file[:-6]))
    return all_files, file_names

if __name__ == '__main__':
    args = typecast(sys.argv[1:])

    path_to_imgs = args[0]
    path_to_csv = args[1]
    path_to_save = args[2]
    # read ids from combined.csv
    ids = get_asparagus_ids(path_to_csv)
    print('#ids: ' + str(len(ids)))
    # get all image file names and the corresponding paths
    file_paths, file_names = get_files(path_to_imgs)
    print('#files found: ' + str(len(file_names)))
    files = np.array(file_names)
    print(files[0:10])
    index_list = []
    for item in ids:
        item_index = np.where(files == item)
        for idx in item_index[0]:
            shutil.copy(file_paths[int(idx)], path_to_save)
        # to check whether all ids were found, count the number of found indices
        index_list.append(item_index)
    print('#indices found: ' + str(len(index_list)))
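A hedged usage sketch (paths hypothetical; the three arguments are the image root directory, the combined label csv, and the destination directory):

    python code/preprocess/move_files.py path/to/images/ path/to/combined.csv path/to/labeled_subset/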
