
Commit dabbc91

anmikh authored and Noiredd committed
Added Swish layer (#6002)
* added swish layer (cpu)
* swish layer: added tests
* swish layer: optimized backpropagation
* swish layer: added cuda implementation
* swish layer: added beta parameter
* swish layer: incorporated sigmoid layer
* swish layer: fix comment of last added parameter
* swish layer: added REGISTER_LAYER_CLASS
1 parent f049522 commit dabbc91

File tree: 5 files changed (+308, -1 lines)


include/caffe/layers/swish_layer.hpp (+96)
@@ -0,0 +1,96 @@
#ifndef CAFFE_SWISH_LAYER_HPP_
#define CAFFE_SWISH_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

#include "caffe/layers/neuron_layer.hpp"
#include "caffe/layers/sigmoid_layer.hpp"

namespace caffe {

/**
 * @brief Swish non-linearity @f$ y = x \sigma (\beta x) @f$.
 *        A novel activation function that tends to work better than ReLU [1].
 *
 * [1] Prajit Ramachandran, Barret Zoph, Quoc V. Le. "Searching for
 *     Activation Functions". arXiv preprint arXiv:1710.05941v2 (2017).
 */
template <typename Dtype>
class SwishLayer : public NeuronLayer<Dtype> {
 public:
  /**
   * @param param provides SwishParameter swish_param,
   *     with SwishLayer options:
   *   - beta (\b optional, default 1).
   *     the value @f$ \beta @f$ in the @f$ y = x \sigma (\beta x) @f$.
   */
  explicit SwishLayer(const LayerParameter& param)
      : NeuronLayer<Dtype>(param),
        sigmoid_layer_(new SigmoidLayer<Dtype>(param)),
        sigmoid_input_(new Blob<Dtype>()),
        sigmoid_output_(new Blob<Dtype>()) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "Swish"; }

 protected:
  /**
   * @param bottom input Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the inputs @f$ x @f$
   * @param top output Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the computed outputs @f$
   *        y = x \sigma (\beta x)
   *      @f$.
   */
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  /**
   * @brief Computes the error gradient w.r.t. the sigmoid inputs.
   *
   * @param top output Blob vector (length 1), providing the error gradient with
   *      respect to the outputs
   *   -# @f$ (N \times C \times H \times W) @f$
   *      containing error gradients @f$ \frac{\partial E}{\partial y} @f$
   *      with respect to computed outputs @f$ y @f$
   * @param propagate_down see Layer::Backward.
   * @param bottom input Blob vector (length 1)
   *   -# @f$ (N \times C \times H \times W) @f$
   *      the inputs @f$ x @f$; Backward fills their diff with
   *      gradients @f$
   *        \frac{\partial E}{\partial x}
   *            = \frac{\partial E}{\partial y}(\beta y +
   *              \sigma (\beta x)(1 - \beta y))
   *      @f$ if propagate_down[0]
   */
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  /// The internal SigmoidLayer
  shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;
  /// sigmoid_input_ stores the input of the SigmoidLayer.
  shared_ptr<Blob<Dtype> > sigmoid_input_;
  /// sigmoid_output_ stores the output of the SigmoidLayer.
  shared_ptr<Blob<Dtype> > sigmoid_output_;
  /// bottom vector holder to call the underlying SigmoidLayer::Forward
  vector<Blob<Dtype>*> sigmoid_bottom_vec_;
  /// top vector holder to call the underlying SigmoidLayer::Forward
  vector<Blob<Dtype>*> sigmoid_top_vec_;
};

}  // namespace caffe

#endif  // CAFFE_SWISH_LAYER_HPP_
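For reference, the gradient documented in the Backward comment above follows from the product and chain rules; a short derivation (standard calculus, not part of the patch):

    y = x \, \sigma(\beta x), \qquad
    \frac{\partial y}{\partial x}
        = \sigma(\beta x) + \beta x \, \sigma(\beta x)\bigl(1 - \sigma(\beta x)\bigr)
        = \beta y + \sigma(\beta x)\,(1 - \beta y),

using \sigma'(z) = \sigma(z)(1 - \sigma(z)) and \beta x \, \sigma(\beta x) = \beta y. Writing the gradient in terms of the already-computed output y and the cached \sigma(\beta x) is what lets Backward reuse the forward results instead of re-evaluating the exponential.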

src/caffe/layers/swish_layer.cpp (+68)
@@ -0,0 +1,68 @@
#include <cmath>
#include <vector>

#include "caffe/layers/swish_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void SwishLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  NeuronLayer<Dtype>::LayerSetUp(bottom, top);
  sigmoid_bottom_vec_.clear();
  sigmoid_bottom_vec_.push_back(sigmoid_input_.get());
  sigmoid_top_vec_.clear();
  sigmoid_top_vec_.push_back(sigmoid_output_.get());
  sigmoid_layer_->SetUp(sigmoid_bottom_vec_, sigmoid_top_vec_);
}

template <typename Dtype>
void SwishLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  NeuronLayer<Dtype>::Reshape(bottom, top);
  sigmoid_input_->ReshapeLike(*bottom[0]);
  sigmoid_layer_->Reshape(sigmoid_bottom_vec_, sigmoid_top_vec_);
}

template <typename Dtype>
void SwishLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* sigmoid_input_data = sigmoid_input_->mutable_cpu_data();
  Dtype* top_data = top[0]->mutable_cpu_data();
  const int count = bottom[0]->count();
  Dtype beta = this->layer_param_.swish_param().beta();
  caffe_copy(count, bottom_data, sigmoid_input_data);
  caffe_scal(count, beta, sigmoid_input_data);
  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
  caffe_mul(count, bottom_data, sigmoid_output_->cpu_data(), top_data);
}

template <typename Dtype>
void SwishLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_data = top[0]->cpu_data();
    const Dtype* top_diff = top[0]->cpu_diff();
    const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const int count = bottom[0]->count();
    Dtype beta = this->layer_param_.swish_param().beta();
    for (int i = 0; i < count; ++i) {
      const Dtype swish_x = top_data[i];
      bottom_diff[i] = top_diff[i] * (beta * swish_x + sigmoid_output_data[i]
          * (1. - beta * swish_x));
    }
  }
}

#ifdef CPU_ONLY
STUB_GPU(SwishLayer);
#endif

INSTANTIATE_CLASS(SwishLayer);
REGISTER_LAYER_CLASS(Swish);

}  // namespace caffe
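For readers who want the arithmetic without the Blob and caffe_* plumbing, here is a minimal standalone C++ sketch of the same forward and backward passes; the function names and the plain-array interface are illustrative only, not part of the patch:

#include <cmath>
#include <cstdio>

// Swish forward: y = x * sigmoid(beta * x).
void swish_forward(const float* x, float* y, int n, float beta) {
  for (int i = 0; i < n; ++i) {
    const float s = 1.0f / (1.0f + std::exp(-beta * x[i]));
    y[i] = x[i] * s;
  }
}

// Swish backward: dE/dx = dE/dy * (beta*y + sigmoid(beta*x) * (1 - beta*y)),
// the same expression the layer's Backward_cpu loop evaluates.
void swish_backward(const float* x, const float* y, const float* dy,
                    float* dx, int n, float beta) {
  for (int i = 0; i < n; ++i) {
    const float s = 1.0f / (1.0f + std::exp(-beta * x[i]));
    dx[i] = dy[i] * (beta * y[i] + s * (1.0f - beta * y[i]));
  }
}

int main() {
  const float x[3] = {-1.0f, 0.0f, 2.0f};
  const float dy[3] = {1.0f, 1.0f, 1.0f};  // upstream gradient of ones
  float y[3], dx[3];
  swish_forward(x, y, 3, 1.0f);
  swish_backward(x, y, dy, dx, 3, 1.0f);
  for (int i = 0; i < 3; ++i) {
    std::printf("x=% .1f  swish=% .6f  dswish/dx=% .6f\n", x[i], y[i], dx[i]);
  }
  return 0;
}

Running this prints, for example, a derivative of 0.5 at x = 0, matching swish'(0) = sigma(0) = 1/2.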

src/caffe/layers/swish_layer.cu (+54)
@@ -0,0 +1,54 @@
#include <cmath>
#include <vector>

#include "caffe/layers/swish_layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

template <typename Dtype>
void SwishLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  Dtype* sigmoid_input_data = sigmoid_input_->mutable_gpu_data();
  Dtype* top_data = top[0]->mutable_gpu_data();
  const int count = bottom[0]->count();
  Dtype beta = this->layer_param_.swish_param().beta();
  caffe_copy(count, bottom_data, sigmoid_input_data);
  caffe_gpu_scal(count, beta, sigmoid_input_data);
  sigmoid_layer_->Forward(sigmoid_bottom_vec_, sigmoid_top_vec_);
  caffe_gpu_mul(count, bottom_data, sigmoid_output_->gpu_data(), top_data);
}

template <typename Dtype>
__global__ void SwishBackward(const int n, const Dtype* in_diff,
    const Dtype* out_data, const Dtype* sigmoid_output_data, Dtype* out_diff,
    const Dtype beta) {
  CUDA_KERNEL_LOOP(index, n) {
    const Dtype swish_x = out_data[index];
    out_diff[index] = in_diff[index] * (beta * swish_x
        + sigmoid_output_data[index] * (1 - beta * swish_x));
  }
}

template <typename Dtype>
void SwishLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[0]) {
    const Dtype* top_data = top[0]->gpu_data();
    const Dtype* top_diff = top[0]->gpu_diff();
    const Dtype* sigmoid_output_data = sigmoid_output_->gpu_data();
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const int count = bottom[0]->count();
    Dtype beta = this->layer_param_.swish_param().beta();
    // NOLINT_NEXT_LINE(whitespace/operators)
    SwishBackward<Dtype><<<CAFFE_GET_BLOCKS(count), CAFFE_CUDA_NUM_THREADS>>>(
        count, top_diff, top_data, sigmoid_output_data, bottom_diff, beta);
    CUDA_POST_KERNEL_CHECK;
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(SwishLayer);

}  // namespace caffe

src/caffe/proto/caffe.proto (+11, -1)
@@ -322,7 +322,7 @@ message ParamSpec {
 // NOTE
 // Update the next available ID when you add a new LayerParameter field.
 //
-// LayerParameter next available layer-specific ID: 147 (last added: recurrent_param)
+// LayerParameter next available layer-specific ID: 148 (last added: swish_param)
 message LayerParameter {
   optional string name = 1; // the layer name
   optional string type = 2; // the layer type
@@ -415,6 +415,7 @@ message LayerParameter {
   optional SoftmaxParameter softmax_param = 125;
   optional SPPParameter spp_param = 132;
   optional SliceParameter slice_param = 126;
+  optional SwishParameter swish_param = 147;
   optional TanHParameter tanh_param = 127;
   optional ThresholdParameter threshold_param = 128;
   optional TileParameter tile_param = 138;
@@ -1156,6 +1157,15 @@ message SoftmaxParameter {
   optional int32 axis = 2 [default = 1];
 }
 
+// Message that stores parameters used by SwishLayer
+message SwishParameter {
+  // Beta parameter for the Swish activation function
+  // Described in:
+  // Prajit Ramachandran, Barret Zoph, Quoc V. Le. (2017). Searching for
+  // Activation Functions. https://arxiv.org/abs/1710.05941v2
+  optional float beta = 1 [default = 1];
+}
+
 message TanHParameter {
   enum Engine {
     DEFAULT = 0;
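If beta is omitted it falls back to its default of 1. As an illustration of how the new field is consumed, here is a hypothetical C++ helper that builds a LayerParameter the same way the unit tests below do, via protobuf text format (the helper name is ours, not part of the patch):

#include <string>
#include <google/protobuf/text_format.h>
#include "caffe/proto/caffe.pb.h"

// Hypothetical helper: constructs a Swish layer definition with a given beta
// by parsing protobuf text format into the generated LayerParameter message.
caffe::LayerParameter MakeSwishLayerParam(float beta) {
  caffe::LayerParameter layer_param;
  const std::string spec =
      "name: \"my_swish\" type: \"Swish\" swish_param { beta: " +
      std::to_string(beta) + " }";
  google::protobuf::TextFormat::ParseFromString(spec, &layer_param);
  return layer_param;
}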

src/caffe/test/test_neuron_layer.cpp (+79)
@@ -19,6 +19,7 @@
 #include "caffe/layers/prelu_layer.hpp"
 #include "caffe/layers/relu_layer.hpp"
 #include "caffe/layers/sigmoid_layer.hpp"
+#include "caffe/layers/swish_layer.hpp"
 #include "caffe/layers/tanh_layer.hpp"
 #include "caffe/layers/threshold_layer.hpp"
 
@@ -344,6 +345,84 @@ TYPED_TEST(NeuronLayerTest, TestSigmoidGradient) {
       this->blob_top_vec_);
 }
 
+TYPED_TEST(NeuronLayerTest, TestSwish) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  SwishLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / (1. + exp(-bottom_data[i])));
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishWithBeta) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 1.5 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / (1. + exp(-1.5 *
+        bottom_data[i])));
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishAsLinear) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 0.0 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
+  layer.Forward(this->blob_bottom_vec_, this->blob_top_vec_);
+  // Now, check values
+  const Dtype* bottom_data = this->blob_bottom_->cpu_data();
+  const Dtype* top_data = this->blob_top_->cpu_data();
+  for (int i = 0; i < this->blob_bottom_->count(); ++i) {
+    EXPECT_FLOAT_EQ(top_data[i], bottom_data[i] / 2.0);
+  }
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  SwishLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishWithBetaGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 1.5 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
+TYPED_TEST(NeuronLayerTest, TestSwishAsLinearGradient) {
+  typedef typename TypeParam::Dtype Dtype;
+  LayerParameter layer_param;
+  CHECK(google::protobuf::TextFormat::ParseFromString(
+      "swish_param { beta: 0.0 }", &layer_param));
+  SwishLayer<Dtype> layer(layer_param);
+  GradientChecker<Dtype> checker(1e-2, 1e-3, 1701, 0., 0.01);
+  checker.CheckGradientEltwise(&layer, this->blob_bottom_vec_,
+      this->blob_top_vec_);
+}
+
 TYPED_TEST(NeuronLayerTest, TestTanH) {
   typedef typename TypeParam::Dtype Dtype;
   LayerParameter layer_param;