
Commit 3ba2054

Switched multi-GPU to NCCL
Parent: 2317fa1


48 files changed: +813, -873 lines

CMakeLists.txt (+1)

@@ -28,6 +28,7 @@ include(cmake/ConfigGen.cmake)
 # ---[ Options
 caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA
 caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY)
+caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF)
 caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
 caffe_option(BUILD_python "Build Python wrapper" ON)
 set(python_version "2" CACHE STRING "Specify which Python version to use")

Makefile (+6)

@@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1)
 	COMMON_FLAGS += -DUSE_CUDNN
 endif

+# NCCL acceleration configuration
+ifeq ($(USE_NCCL), 1)
+	LIBRARIES += nccl
+	COMMON_FLAGS += -DUSE_NCCL
+endif
+
 # configure IO libraries
 ifeq ($(USE_OPENCV), 1)
 	COMMON_FLAGS += -DUSE_OPENCV

Makefile.config.example (+4)

@@ -94,6 +94,10 @@ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
 # INCLUDE_DIRS += $(shell brew --prefix)/include
 # LIBRARY_DIRS += $(shell brew --prefix)/lib

+# NCCL acceleration switch (uncomment to build with NCCL)
+# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0)
+# USE_NCCL := 1
+
 # Uncomment to use `pkg-config` to specify OpenCV library paths.
 # (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.)
 # USE_PKG_CONFIG := 1

cmake/Dependencies.cmake (+11, -4)

@@ -67,6 +67,13 @@ if(NOT HAVE_CUDA)
   add_definitions(-DCPU_ONLY)
 endif()

+if(USE_NCCL)
+  find_package(NCCL REQUIRED)
+  include_directories(SYSTEM ${NCCL_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES})
+  add_definitions(-DUSE_NCCL)
+endif()
+
 # ---[ OpenCV
 if(USE_OPENCV)
   find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)

@@ -119,18 +126,18 @@ if(BUILD_python)
     find_package(NumPy 1.7.1)
     # Find the matching boost python implementation
     set(version ${PYTHONLIBS_VERSION_STRING})
-
+
     STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
     find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
     set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
-
+
     while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND)
       STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} )
-
+
       STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
       find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
       set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
-
+
       STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} )
       if("${has_more_version}" STREQUAL "")
         break()

cmake/Modules/FindNCCL.cmake (new file, +26)

@@ -0,0 +1,26 @@
+set(NCCL_INC_PATHS
+    /usr/include
+    /usr/local/include
+    $ENV{NCCL_DIR}/include
+    )
+
+set(NCCL_LIB_PATHS
+    /lib
+    /lib64
+    /usr/lib
+    /usr/lib64
+    /usr/local/lib
+    /usr/local/lib64
+    $ENV{NCCL_DIR}/lib
+    )
+
+find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
+find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
+
+if (NCCL_FOUND)
+  message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
+  mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
+endif ()
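
For context, here is a minimal stand-alone sketch of the single-process NCCL pattern this commit builds on: one communicator per GPU and one ncclAllReduce that sums a buffer across all devices. This is not code from the commit; it assumes NCCL v1 (as linked above, where the collective count is an int and no group calls are needed) and the CUDA runtime.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

int main() {
  int ndev = 0;
  cudaGetDeviceCount(&ndev);
  if (ndev < 2) { std::printf("need at least 2 GPUs\n"); return 0; }

  std::vector<int> devs(ndev);
  for (int i = 0; i < ndev; ++i) devs[i] = i;
  std::vector<ncclComm_t> comms(ndev);
  ncclCommInitAll(comms.data(), ndev, devs.data());  // one comm per device

  const int count = 1 << 20;
  std::vector<float*> buf(ndev);
  std::vector<cudaStream_t> streams(ndev);
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(devs[i]);
    cudaMalloc(&buf[i], count * sizeof(float));
    cudaMemset(buf[i], 0, count * sizeof(float));
    cudaStreamCreate(&streams[i]);
  }

  // In-place allreduce: every device ends up with the elementwise sum.
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(devs[i]);
    ncclAllReduce(buf[i], buf[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  }
  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(devs[i]);
    cudaStreamSynchronize(streams[i]);
  }

  for (int i = 0; i < ndev; ++i) {
    cudaSetDevice(devs[i]);
    cudaFree(buf[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}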

cmake/Summary.cmake (+1)

@@ -117,6 +117,7 @@ function(caffe_print_configuration_summary)
   caffe_status("  USE_OPENCV        : ${USE_OPENCV}")
   caffe_status("  USE_LEVELDB       : ${USE_LEVELDB}")
   caffe_status("  USE_LMDB          : ${USE_LMDB}")
+  caffe_status("  USE_NCCL          : ${USE_NCCL}")
   caffe_status("  ALLOW_LMDB_NOLOCK : ${ALLOW_LMDB_NOLOCK}")
   caffe_status("")
   caffe_status("Dependencies:")

include/caffe/blob.hpp (+1)

@@ -220,6 +220,7 @@ class Blob {
   void set_cpu_data(Dtype* data);
   const int* gpu_shape() const;
   const Dtype* gpu_data() const;
+  void set_gpu_data(Dtype* data);
   const Dtype* cpu_diff() const;
   const Dtype* gpu_diff() const;
   Dtype* mutable_cpu_data();
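
The new set_gpu_data lets a Blob point at device memory it does not own, which is what allows a multi-GPU setup to keep all learnable parameters in one contiguous per-GPU buffer so a single collective covers every gradient. A plain-C++ stand-in for that layout idea (hypothetical FakeBlob type, host memory substituted for GPU memory, not the commit's code):

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a parameter blob that can alias external
// storage, analogous to the new Blob::set_gpu_data(Dtype*).
struct FakeBlob {
  float* data = nullptr;
  size_t count = 0;
  explicit FakeBlob(size_t n) : count(n) {}
  void set_data(float* ptr) { data = ptr; }
};

// Lay every blob out back-to-back in one master buffer so one
// collective over `master` covers all parameters at once.
void MapIntoMaster(std::vector<FakeBlob>& blobs, std::vector<float>& master) {
  size_t total = 0;
  for (const FakeBlob& b : blobs) total += b.count;
  master.assign(total, 0.f);
  size_t offset = 0;
  for (FakeBlob& b : blobs) {
    b.set_data(master.data() + offset);
    offset += b.count;
  }
}

int main() {
  std::vector<FakeBlob> blobs{FakeBlob(3), FakeBlob(5), FakeBlob(2)};
  std::vector<float> master;
  MapIntoMaster(blobs, master);
  std::cout << "master holds " << master.size() << " floats\n";  // 10
}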

include/caffe/common.hpp (+10, -4)

@@ -158,11 +158,14 @@ class Caffe {
   // Search from start_id to the highest possible device ordinal,
   // return the ordinal of the first available device.
   static int FindDevice(const int start_id = 0);
-  // Parallel training info
+  // Parallel training
   inline static int solver_count() { return Get().solver_count_; }
   inline static void set_solver_count(int val) { Get().solver_count_ = val; }
-  inline static bool root_solver() { return Get().root_solver_; }
-  inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
+  inline static int solver_rank() { return Get().solver_rank_; }
+  inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
+  inline static bool multiprocess() { return Get().multiprocess_; }
+  inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; }
+  inline static bool root_solver() { return Get().solver_rank_ == 0; }

 protected:
 #ifndef CPU_ONLY

@@ -172,8 +175,11 @@ class Caffe {
   shared_ptr<RNG> random_generator_;

   Brew mode_;
+
+  // Parallel training
   int solver_count_;
-  bool root_solver_;
+  int solver_rank_;
+  bool multiprocess_;

 private:
   // The private constructor to avoid duplicate instantiation.
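
Note that root_solver() is now derived rather than stored: rank 0 is the root, following the NCCL/MPI convention, and set_root_solver disappears. A standalone model of the new state (an illustrative struct, not the Caffe singleton itself):

#include <iostream>

// Standalone model of Caffe's new parallel-training state: the root
// flag is computed from the rank instead of being settable on its own.
struct ParallelState {
  int solver_count = 1;       // total solvers in the training group
  int solver_rank = 0;        // this solver's rank within the group
  bool multiprocess = false;  // true when solvers run in separate processes
  bool root_solver() const { return solver_rank == 0; }  // derived, not stored
};

int main() {
  ParallelState worker;
  worker.solver_count = 4;
  worker.solver_rank = 1;
  std::cout << std::boolalpha << worker.root_solver() << '\n';  // false: rank 1 is a worker
}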

include/caffe/data_reader.hpp (-82)

This file was deleted.

include/caffe/internal_thread.hpp (+2, -2)

@@ -42,8 +42,8 @@ class InternalThread {
   bool must_stop();

 private:
-  void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count,
-      bool root_solver);
+  void entry(int device, Caffe::Brew mode, int rand_seed,
+      int solver_count, int solver_rank, bool multiprocess);

   shared_ptr<boost::thread> thread_;
 };

include/caffe/layer.hpp (+1, -42)

@@ -38,7 +38,7 @@ class Layer {
    * layer.
    */
   explicit Layer(const LayerParameter& param)
-    : layer_param_(param), is_shared_(false) {
+    : layer_param_(param) {
       // Set phase and copy blobs (if there are any).
       phase_ = param.phase();
       if (layer_param_.blobs_size() > 0) {

@@ -66,7 +66,6 @@ class Layer {
    */
   void SetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {
-    InitMutex();
     CheckBlobCounts(bottom, top);
     LayerSetUp(bottom, top);
     Reshape(bottom, top);

@@ -92,30 +91,6 @@ class Layer {
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top) {}

-  /**
-   * @brief Whether a layer should be shared by multiple nets during data
-   *        parallelism. By default, all layers except for data layers should
-   *        not be shared. data layers should be shared to ensure each worker
-   *        solver access data sequentially during data parallelism.
-   */
-  virtual inline bool ShareInParallel() const { return false; }
-
-  /** @brief Return whether this layer is actually shared by other nets.
-   *         If ShareInParallel() is true and using more than one GPU and the
-   *         net has TRAIN phase, then this function is expected return true.
-   */
-  inline bool IsShared() const { return is_shared_; }
-
-  /** @brief Set whether this layer is actually shared by other nets
-   *         If ShareInParallel() is true and using more than one GPU and the
-   *         net has TRAIN phase, then is_shared should be set true.
-   */
-  inline void SetShared(bool is_shared) {
-    CHECK(ShareInParallel() || !is_shared)
-        << type() << "Layer does not support sharing.";
-    is_shared_ = is_shared;
-  }
-
   /**
    * @brief Adjust the shapes of top blobs and internal buffers to accommodate
    *        the shapes of the bottom blobs.

@@ -428,19 +403,6 @@ class Layer {
   }

 private:
-  /** Whether this layer is actually shared by other nets*/
-  bool is_shared_;
-
-  /** The mutex for sequential forward if this layer is shared */
-  shared_ptr<boost::mutex> forward_mutex_;
-
-  /** Initialize forward_mutex_ */
-  void InitMutex();
-  /** Lock forward_mutex_ if this layer is shared */
-  void Lock();
-  /** Unlock forward_mutex_ if this layer is shared */
-  void Unlock();
-
   DISABLE_COPY_AND_ASSIGN(Layer);
 };  // class Layer

@@ -450,8 +412,6 @@ class Layer {
 template <typename Dtype>
 inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
     const vector<Blob<Dtype>*>& top) {
-  // Lock during forward to ensure sequential forward
-  Lock();
   Dtype loss = 0;
   Reshape(bottom, top);
   switch (Caffe::mode()) {

@@ -482,7 +442,6 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
   default:
     LOG(FATAL) << "Unknown caffe mode.";
   }
-  Unlock();
   return loss;
 }

include/caffe/layers/base_data_layer.hpp (+3, -2)

@@ -68,15 +68,16 @@ class BasePrefetchingDataLayer :
       const vector<Blob<Dtype>*>& top);

   // Prefetches batches (asynchronously if to GPU memory)
-  static const int PREFETCH_COUNT = 3;
+  static const int PREFETCH_COUNT = 4;  // same as proto

 protected:
   virtual void InternalThreadEntry();
   virtual void load_batch(Batch<Dtype>* batch) = 0;

-  Batch<Dtype> prefetch_[PREFETCH_COUNT];
+  vector<shared_ptr<Batch<Dtype> > > prefetch_;
   BlockingQueue<Batch<Dtype>*> prefetch_free_;
   BlockingQueue<Batch<Dtype>*> prefetch_full_;
+  Batch<Dtype>* prefetch_current_;

   Blob<Dtype> transformed_data_;
 };
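
The prefetch batches move from a fixed in-class array to heap-allocated Batches cycled through the two blocking queues, with prefetch_current_ tracking the batch currently handed to the forward pass. A generic sketch of that free/full queue pattern (simplified; not Caffe's BlockingQueue):

#include <condition_variable>
#include <mutex>
#include <queue>

// Minimal blocking queue: pop() waits until an element is available.
template <typename T>
class SimpleBlockingQueue {
 public:
  void push(T t) {
    {
      std::lock_guard<std::mutex> guard(mutex_);
      queue_.push(std::move(t));
    }
    cond_.notify_one();
  }
  T pop() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return !queue_.empty(); });
    T t = std::move(queue_.front());
    queue_.pop();
    return t;
  }
 private:
  std::queue<T> queue_;
  std::mutex mutex_;
  std::condition_variable cond_;
};

// Usage pattern: the prefetch thread runs
//   Batch* b = free.pop(); load(b); full.push(b);
// while the consumer runs
//   if (current) free.push(current);  // recycle the last batch
//   current = full.pop();             // block until data is ready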

include/caffe/layers/data_layer.hpp (+5, -2)

@@ -4,7 +4,6 @@
 #include <vector>

 #include "caffe/blob.hpp"
-#include "caffe/data_reader.hpp"
 #include "caffe/data_transformer.hpp"
 #include "caffe/internal_thread.hpp"
 #include "caffe/layer.hpp"

@@ -29,9 +28,13 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
   virtual inline int MaxTopBlobs() const { return 2; }

 protected:
+  void Next();
+  bool Skip();
   virtual void load_batch(Batch<Dtype>* batch);

-  DataReader reader_;
+  shared_ptr<db::DB> db_;
+  shared_ptr<db::Cursor> cursor_;
+  uint64_t offset_;
 };

 }  // namespace caffe
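
With DataReader gone, each solver owns its own db::Cursor and the offset_ counter drives Skip(): records are assigned round-robin by solver rank, so workers partition the database instead of draining a shared queue (the HDF5 layer below gains the same Skip/Next pair). A standalone sketch of the assignment rule (illustrative, not the commit's exact code):

#include <cstdint>
#include <iostream>

// Record at `offset` belongs to solver `rank` iff offset % size == rank,
// so `size` solvers partition the stream round-robin with no coordination.
bool skip_record(uint64_t offset, int rank, int size) {
  return offset % static_cast<uint64_t>(size) != static_cast<uint64_t>(rank);
}

int main() {
  const int solver_count = 2;
  for (int rank = 0; rank < solver_count; ++rank) {
    std::cout << "rank " << rank << " reads offsets:";
    for (uint64_t offset = 0; offset < 8; ++offset) {
      if (!skip_record(offset, rank, solver_count)) std::cout << ' ' << offset;
    }
    std::cout << '\n';
  }
  return 0;
}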

include/caffe/layers/hdf5_data_layer.hpp (+5, -1)

@@ -23,7 +23,7 @@ template <typename Dtype>
 class HDF5DataLayer : public Layer<Dtype> {
  public:
   explicit HDF5DataLayer(const LayerParameter& param)
-      : Layer<Dtype>(param) {}
+      : Layer<Dtype>(param), offset_() {}
   virtual ~HDF5DataLayer();
   virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);

@@ -38,6 +38,9 @@ class HDF5DataLayer : public Layer<Dtype> {
   virtual inline int MinTopBlobs() const { return 1; }

  protected:
+  void Next();
+  bool Skip();
+
   virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
       const vector<Blob<Dtype>*>& top);
   virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,

@@ -55,6 +58,7 @@ class HDF5DataLayer : public Layer<Dtype> {
   std::vector<shared_ptr<Blob<Dtype> > > hdf_blobs_;
   std::vector<unsigned int> data_permutation_;
   std::vector<unsigned int> file_permutation_;
+  uint64_t offset_;
 };

 }  // namespace caffe
