Skip to content

Commit 3ba2054

Browse files
committed
Switched multi-GPU to NCCL
1 parent 2317fa1 commit 3ba2054

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

48 files changed

+813
-873
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ include(cmake/ConfigGen.cmake)
2828
# ---[ Options
2929
caffe_option(CPU_ONLY "Build Caffe without CUDA support" OFF) # TODO: rename to USE_CUDA
3030
caffe_option(USE_CUDNN "Build Caffe with cuDNN library support" ON IF NOT CPU_ONLY)
31+
caffe_option(USE_NCCL "Build Caffe with NCCL library support" OFF)
3132
caffe_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
3233
caffe_option(BUILD_python "Build Python wrapper" ON)
3334
set(python_version "2" CACHE STRING "Specify which Python version to use")

Makefile

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1)
328328
COMMON_FLAGS += -DUSE_CUDNN
329329
endif
330330

331+
# NCCL acceleration configuration
332+
ifeq ($(USE_NCCL), 1)
333+
LIBRARIES += nccl
334+
COMMON_FLAGS += -DUSE_NCCL
335+
endif
336+
331337
# configure IO libraries
332338
ifeq ($(USE_OPENCV), 1)
333339
COMMON_FLAGS += -DUSE_OPENCV

Makefile.config.example

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,10 @@ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
9494
# INCLUDE_DIRS += $(shell brew --prefix)/include
9595
# LIBRARY_DIRS += $(shell brew --prefix)/lib
9696

97+
# NCCL acceleration switch (uncomment to build with NCCL)
98+
# https://github.com/NVIDIA/nccl (last tested version: v1.2.3-1+cuda8.0)
99+
# USE_NCCL := 1
100+
97101
# Uncomment to use `pkg-config` to specify OpenCV library paths.
98102
# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.)
99103
# USE_PKG_CONFIG := 1

cmake/Dependencies.cmake

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,13 @@ if(NOT HAVE_CUDA)
6767
add_definitions(-DCPU_ONLY)
6868
endif()
6969

70+
if(USE_NCCL)
71+
find_package(NCCL REQUIRED)
72+
include_directories(SYSTEM ${NCCL_INCLUDE_DIR})
73+
list(APPEND Caffe_LINKER_LIBS ${NCCL_LIBRARIES})
74+
add_definitions(-DUSE_NCCL)
75+
endif()
76+
7077
# ---[ OpenCV
7178
if(USE_OPENCV)
7279
find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
@@ -119,18 +126,18 @@ if(BUILD_python)
119126
find_package(NumPy 1.7.1)
120127
# Find the matching boost python implementation
121128
set(version ${PYTHONLIBS_VERSION_STRING})
122-
129+
123130
STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
124131
find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
125132
set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
126-
133+
127134
while(NOT "${version}" STREQUAL "" AND NOT Boost_PYTHON_FOUND)
128135
STRING( REGEX REPLACE "([0-9.]+).[0-9]+" "\\1" version ${version} )
129-
136+
130137
STRING( REGEX REPLACE "[^0-9]" "" boost_py_version ${version} )
131138
find_package(Boost 1.46 COMPONENTS "python-py${boost_py_version}")
132139
set(Boost_PYTHON_FOUND ${Boost_PYTHON-PY${boost_py_version}_FOUND})
133-
140+
134141
STRING( REGEX MATCHALL "([0-9.]+).[0-9]+" has_more_version ${version} )
135142
if("${has_more_version}" STREQUAL "")
136143
break()

cmake/Modules/FindNCCL.cmake

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,26 @@
1+
set(NCCL_INC_PATHS
2+
/usr/include
3+
/usr/local/include
4+
$ENV{NCCL_DIR}/include
5+
)
6+
7+
set(NCCL_LIB_PATHS
8+
/lib
9+
/lib64
10+
/usr/lib
11+
/usr/lib64
12+
/usr/local/lib
13+
/usr/local/lib64
14+
$ENV{NCCL_DIR}/lib
15+
)
16+
17+
find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
18+
find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
19+
20+
include(FindPackageHandleStandardArgs)
21+
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
22+
23+
if (NCCL_FOUND)
24+
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
25+
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
26+
endif ()

cmake/Summary.cmake

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -117,6 +117,7 @@ function(caffe_print_configuration_summary)
117117
caffe_status(" USE_OPENCV : ${USE_OPENCV}")
118118
caffe_status(" USE_LEVELDB : ${USE_LEVELDB}")
119119
caffe_status(" USE_LMDB : ${USE_LMDB}")
120+
caffe_status(" USE_NCCL : ${USE_NCCL}")
120121
caffe_status(" ALLOW_LMDB_NOLOCK : ${ALLOW_LMDB_NOLOCK}")
121122
caffe_status("")
122123
caffe_status("Dependencies:")

include/caffe/blob.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,7 @@ class Blob {
220220
void set_cpu_data(Dtype* data);
221221
const int* gpu_shape() const;
222222
const Dtype* gpu_data() const;
223+
void set_gpu_data(Dtype* data);
223224
const Dtype* cpu_diff() const;
224225
const Dtype* gpu_diff() const;
225226
Dtype* mutable_cpu_data();

include/caffe/common.hpp

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -158,11 +158,14 @@ class Caffe {
158158
// Search from start_id to the highest possible device ordinal,
159159
// return the ordinal of the first available device.
160160
static int FindDevice(const int start_id = 0);
161-
// Parallel training info
161+
// Parallel training
162162
inline static int solver_count() { return Get().solver_count_; }
163163
inline static void set_solver_count(int val) { Get().solver_count_ = val; }
164-
inline static bool root_solver() { return Get().root_solver_; }
165-
inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
164+
inline static int solver_rank() { return Get().solver_rank_; }
165+
inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
166+
inline static bool multiprocess() { return Get().multiprocess_; }
167+
inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; }
168+
inline static bool root_solver() { return Get().solver_rank_ == 0; }
166169

167170
protected:
168171
#ifndef CPU_ONLY
@@ -172,8 +175,11 @@ class Caffe {
172175
shared_ptr<RNG> random_generator_;
173176

174177
Brew mode_;
178+
179+
// Parallel training
175180
int solver_count_;
176-
bool root_solver_;
181+
int solver_rank_;
182+
bool multiprocess_;
177183

178184
private:
179185
// The private constructor to avoid duplicate instantiation.

include/caffe/data_reader.hpp

Lines changed: 0 additions & 82 deletions
This file was deleted.

include/caffe/internal_thread.hpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,8 +42,8 @@ class InternalThread {
4242
bool must_stop();
4343

4444
private:
45-
void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count,
46-
bool root_solver);
45+
void entry(int device, Caffe::Brew mode, int rand_seed,
46+
int solver_count, int solver_rank, bool multiprocess);
4747

4848
shared_ptr<boost::thread> thread_;
4949
};

0 commit comments

Comments
 (0)