-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathExperiment.m
313 lines (265 loc) · 13.2 KB
/
Experiment.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
classdef Experiment < handle
    %EXPERIMENT Runs one ORCA experiment on a single fold.
    %   An experiment consists of optimising and running a method on a fold
    %   (a pair of train-test dataset partitions). The experiment is
    %   described by a configuration file. This class is used by Utilities
    %   to launch a set of experiments.
    %
    %   EXPERIMENT properties:
    %      data        - DataSet object to store the train/test data
    %      method      - Method to learn and classify data
    %      cvCriteria  - Metric to guide the grid search for parameters optimisation
    %      resultsDir  - Directory to store performance reports and learned models
    %      seed        - Seed to be used for random number generation
    %      crossvalide - Activate cross-validation
    %      parameters  - Struct with the candidate values of each parameter to optimise
    %
    %   EXPERIMENT methods:
    %      launch      - Launch experiment described in file
    %
    %   This file is part of ORCA: https://github.com/ayrna/orca
    %   Original authors: Pedro Antonio Gutiérrez, María Pérez Ortiz, Javier Sánchez Monedero
    %   Citation: If you use this code, please cite the associated paper http://www.uco.es/grupos/ayrna/orreview
    %   Copyright:
    %       This software is released under the The GNU General Public License v3.0 licence
    %       available at http://www.gnu.org/licenses/gpl-3.0.html
    %

    properties
        % NOTE(review): property default expressions are evaluated once,
        % when the class is loaded. If DataSet / Algorithm / MAE are handle
        % classes, every Experiment instance shares these default objects
        % until they are reassigned -- confirm against their definitions.
        data = DataSet;     % train/test data container
        method = Algorithm; % replaced in PROCESS by the configured algorithm
        cvCriteria = MAE;   % metric object used to rank parameter combinations
        crossvalide = 0;    % set to 1 in PROCESS when there are parameters to optimise
        resultsDir = '';    % root directory where SAVERESULTS writes its files
        seed = 1;           % seed for the random stream used to build the CV folds
        parameters;         % parameters to optimize (struct: field name -> candidate values)
    end

    properties (SetAccess = private)
        logsDir             % not used by any method in this class
    end

    methods
        function obj = launch(obj,expFile)
            % LAUNCH Launch experiment described in file.
            %   EXPOBJ = LAUNCH(EXPFILE) parses EXPFILE and runs the experiment
            %   described in it. It performs the following steps:
            %   # Preprocess data: cleaning and standardization (the option
            %     needs to be activated in the configuration file)
            %   # Optimize parameters by performing a grid search (if selected
            %     in configuration file)
            %   # Run algorithm with optimal parameters (if cross-validation
            %     was selected)
            %   # Save experiment results for the fold
            obj.process(expFile);
            obj.run();
        end
    end

    methods(Access = private)
        function obj = run(obj)
            % RUN performs the experiment steps: data preprocessing,
            % parameter optimization (when enabled), training/testing and
            % saving of the results.
            [train,test] = obj.data.preProcessData();

            if obj.crossvalide
                % tic/toc measures elapsed wall time and, unlike
                % etime(clock), is not affected by system clock changes.
                cvTimer = tic;
                Optimals = obj.crossValideParams(train);
                crossvaltime = toc(cvTimer);

                totalResults = obj.method.runAlgorithm(train, test, Optimals);
                totalResults.crossvaltime = crossvaltime;
            else
                totalResults = obj.method.runAlgorithm(train, test);
            end

            obj.saveResults(totalResults);
        end

        function obj = process(obj,fname)
            % PROCESS parses the experiment described in FNAME and copies
            % its values into the properties of this object.
            %
            %   Optional keys of the general section: 'num_folds',
            %   'standarize', 'cvmetric' and 'seed'.
            %   Required keys of the general section: 'directory', 'train',
            %   'test' and 'results'; an error is raised if any is missing.
            %   The algorithm section must contain the key 'algorithm'; the
            %   remaining keys are handed to the algorithm constructor.
            %   Every key of the params section becomes a field of
            %   obj.parameters and activates cross-validation.
            cObj = Config(fname);
            expObj = cObj.exps{:};

            % Copy ini values to corresponding object properties

            % General experiment properties
            % TODO: check robustness and document behaviour of ini file
            if expObj.general.isKey('num_folds')
                obj.data.nOfFolds = str2num(expObj.general('num_folds'));
            end
            if expObj.general.isKey('standarize')
                obj.data.standarize = str2num(expObj.general('standarize'));
            end
            if expObj.general.isKey('cvmetric')
                met = upper(expObj.general('cvmetric'));
                % NOTE(review): eval executes text taken from the
                % configuration file; only use trusted configuration files.
                eval(['obj.cvCriteria = ' met ';']);
            end
            if expObj.general.isKey('seed')
                obj.seed = str2num(expObj.general('seed'));
            end

            try
                obj.data.directory = expObj.general('directory');
                obj.data.train = expObj.general('train');
                obj.data.test = expObj.general('test');
                obj.resultsDir = expObj.general('results');
            catch ME
                error('Configuration file %s does not have mininum fields. Exception %s', fname, ME.identifier)
            end

            % Algorithm properties are transformed to varargs ('key',value)
            varargs = obj.mapsToCell(expObj.algorithm);
            alg = expObj.algorithm('algorithm');
            % The whole cell array is passed as a single argument
            obj.method = feval(alg, varargs);

            % Parameters to be optimized
            if ~isempty(expObj.params)
                pkeys = expObj.params.keys;
                for p = 1:numel(pkeys)
                    % The value is evaluated as a MATLAB expression so that
                    % ranges can be written in the configuration file.
                    % NOTE(review): eval on configuration text, see above.
                    eval(['obj.parameters.' pkeys{p} ' = [' expObj.params(pkeys{p}) '];']);
                    obj.crossvalide = 1;
                end
            end
        end

        function obj = saveResults(obj,TotalResults)
            % SAVERESULTS saves the results of the experiment and
            % the best hyperparameters.
            %
            %   Files are written below obj.resultsDir, in the folders
            %   'OptHyperparams', 'Times', 'Predictions', 'Models' and
            %   'Guess'. These folders are not created here.
            par = obj.method.getParameterNames();
            if ~isempty(par)
                outputFile = [obj.resultsDir filesep 'OptHyperparams' filesep obj.data.dataname ];
                fid = Experiment.openOutputFile(outputFile);
                for i = 1:numel(par)
                    value = TotalResults.model.parameters.(par{i});
                    fprintf(fid,'%s,%f\n', par{i},value);
                end
                fclose(fid);
            end

            % Train, test and cross-validation times (0 when no
            % cross-validation was performed)
            if obj.crossvalide
                crossvaltime = TotalResults.crossvaltime;
            else
                crossvaltime = 0;
            end
            outputFile = [obj.resultsDir filesep 'Times' filesep obj.data.dataname ];
            fid = Experiment.openOutputFile(outputFile);
            fprintf(fid, '%f\n%f\n%f', TotalResults.trainTime, TotalResults.testTime, crossvaltime);
            fclose(fid);

            % Predicted labels
            outputFile = [obj.resultsDir filesep 'Predictions' filesep obj.data.train ];
            dlmwrite(outputFile, TotalResults.predictedTrain);
            outputFile = [obj.resultsDir filesep 'Predictions' filesep obj.data.test ];
            dlmwrite(outputFile, TotalResults.predictedTest);

            % Write complete model
            model = TotalResults.model;
            outputFile = [obj.resultsDir filesep 'Models' filesep obj.data.dataname '.mat'];
            save(outputFile, 'model');

            % Projections
            outputFile = [obj.resultsDir filesep 'Guess' filesep obj.data.train ];
            dlmwrite(outputFile, TotalResults.projectedTrain, 'precision', '%.15f');
            outputFile = [obj.resultsDir filesep 'Guess' filesep obj.data.test ];
            dlmwrite(outputFile, TotalResults.projectedTest, 'precision', '%.15f');
        end

        function optimals = crossValideParams(obj,train)
            % CROSSVALIDEPARAMS Function for performing the cross-validation
            % in a specific train partition.
            %
            %   OPTIMALS = CROSSVALIDEPARAMS(TRAIN) divides the data in
            %   k folds (k is obj.data.nOfFolds), evaluates every
            %   combination of the values in obj.parameters on every fold
            %   and returns the struct OPTIMALS (one field per parameter)
            %   with the combination whose mean metric over the folds is
            %   the lowest. OPTIMALS is [] when there are no parameters.
            nOfFolds = obj.data.nOfFolds;
            isOctave = exist('OCTAVE_VERSION', 'builtin') > 0;

            sets = struct2cell(obj.parameters);
            name_parameters = fieldnames(obj.parameters);
            nParam = numel(name_parameters);

            % Full grid: one column per combination, one row per parameter
            c = cell(1, numel(sets));
            [c{:}] = ndgrid( sets{:} );
            combinations = cell2mat( cellfun(@(v)v(:), c, 'UniformOutput',false) );
            combinations = combinations';
            nCombinations = size(combinations,2);

            % Avoid problems with very low number of patterns for some
            % classes: a class with a single pattern gets it duplicated
            uniqueTargets = unique(train.targets);
            for i = 1:size(uniqueTargets,1)
                inClass = (train.targets == uniqueTargets(i));
                if sum(inClass) == 1
                    train.patterns = [train.patterns; train.patterns(inClass,:)];
                    train.targets = [train.targets; train.targets(inClass,:)];
                    [train.targets,idx] = sort(train.targets);
                    train.patterns = train.patterns(idx,:);
                end
            end

            % Use the seed
            if isOctave
                rand('seed',obj.seed);
            else
                s = RandStream.create('mt19937ar','seed',obj.seed);
                if verLessThan('matlab','8.0')
                    RandStream.setDefaultStream(s);
                else
                    RandStream.setGlobalStream(s);
                end
            end

            if isOctave
                pkg load statistics;
                CVO = cvpartition(train.targets,'KFold',nOfFolds);
                numTests = get(CVO,'NumTestSets');
            else
                CVO = cvpartition(train.targets,'k',nOfFolds);
                numTests = CVO.NumTestSets;
            end

            % This metric is computed on the projections, every other one
            % on the predicted labels
            useProjections = strcmp(obj.cvCriteria.name,'Area under curve');

            result = zeros(numTests,nCombinations);

            % Foreach fold
            for ff = 1:numTests
                % Build fold dataset
                if isOctave
                    trIdx = training(CVO,ff);
                    teIdx = test(CVO,ff);
                else
                    trIdx = CVO.training(ff);
                    teIdx = CVO.test(ff);
                end

                auxTrain.targets = train.targets(trIdx,:);
                auxTrain.patterns = train.patterns(trIdx,:);
                auxTest.targets = train.targets(teIdx,:);
                auxTest.patterns = train.patterns(teIdx,:);

                for i = 1:nCombinations
                    % Extract the combination of parameters
                    if nParam ~= 0
                        currentCombination = reshape(combinations(:,i),[1,nParam]);
                        param = cell2struct(num2cell(currentCombination),name_parameters,2);
                    else
                        param = [];
                    end

                    model = obj.method.runAlgorithm(auxTrain, auxTest, param);

                    if useProjections
                        result(ff,i) = obj.cvCriteria.calculateCrossvalMetric(auxTest.targets, model.projectedTest);
                    else
                        result(ff,i) = obj.cvCriteria.calculateCrossvalMetric(auxTest.targets, model.predictedTest);
                    end
                end
            end

            if isOctave
                pkg unload statistics;
            end

            % Average over folds (dimension 1). The dimension is explicit
            % so that a single-row RESULT is not collapsed to a scalar.
            [bestValue,bestIdx] = min(mean(result,1)); %#ok<ASGLU>

            if nParam ~= 0
                optimalCombination = reshape(combinations(:,bestIdx),[1,nParam]);
                optimals = cell2struct(num2cell(optimalCombination),name_parameters,2);
            else
                optimals = [];
            end
        end
    end

    methods (Static = true)
        function varargs = mapsToCell(aObj)
            %MAPSTOCELL Converts a map of algorithm options to a cell array.
            %   VARARGS = MAPSTOCELL(MAPOBJ) returns the key/value pairs of
            %   MAPOBJ, except the key 'algorithm', as a 1x(2*n) cell array
            %   {key1, value1, key2, value2, ...}.
            %   Example: {'kernel', 'rbf', 'c', 0.1}
            %   Values that str2double can convert are returned as numbers,
            %   the rest as character arrays.
            %   If MAPOBJ has a single entry, a 1x1 cell containing [] is
            %   returned.

            % If there are no parameters return empty cell
            if aObj.Count == 1
                varargs = cell(1,1);
                return
            end

            % Work on a copy so that the caller's map keeps 'algorithm'
            mapObj = containers.Map(aObj.keys,aObj.values);
            mapObj.remove('algorithm');
            pkeys = mapObj.keys;
            nKeys = numel(pkeys);

            varargs = cell(1,2*nKeys);

            % The k-th key goes to slot 2k-1 and its value to slot 2k
            for k = 1:nKeys
                keyasstr = pkeys{k};
                value = mapObj(keyasstr);

                varargs{1,2*k-1} = sprintf('%s', keyasstr);

                % Check numerical values
                valuenum = str2double(value);
                if isnan(valuenum) % we have a string
                    varargs{1,2*k} = sprintf('%s', value);
                else % we have a number
                    varargs{1,2*k} = valuenum;
                end
            end
        end
    end

    methods (Static = true, Access = private)
        function fid = openOutputFile(outputFile)
            % OPENOUTPUTFILE opens OUTPUTFILE for writing and raises an
            % error naming the file when it cannot be opened.
            fid = fopen(outputFile,'w');
            if fid == -1
                error('Experiment:cannotOpenFile', 'Could not open file %s for writing', outputFile);
            end
        end
    end
end