Skip to content

Commit 8f6c455

Browse files
authored
Merge pull request #67 from rflamary/new_gpu
[MRG] New ot.gpu with cupy
2 parents 4b05176 + 08b859d commit 8f6c455

File tree

9 files changed

+338
-272
lines changed

9 files changed

+338
-272
lines changed

README.md

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ This open source Python library provide several solvers for optimization problem
1414
It provides the following solvers:
1515

1616
* OT Network Flow solver for the linear program/ Earth Movers Distance [1].
17-
* Entropic regularization OT solver with Sinkhorn Knopp Algorithm [2] and stabilized version [9][10] and greedy SInkhorn [22] with optional GPU implementation (requires cudamat).
17+
* Entropic regularization OT solver with Sinkhorn Knopp Algorithm [2] and stabilized version [9][10] and greedy SInkhorn [22] with optional GPU implementation (requires cupy).
1818
* Smooth optimal transport solvers (dual and semi-dual) for KL and squared L2 regularizations [17].
1919
* Non regularized Wasserstein barycenters [16] with LP solver (only small scale).
2020
* Bregman projections for Wasserstein barycenter [3], convolutional barycenter [21] and unmixing [4].
@@ -83,12 +83,8 @@ Some sub-modules require additional dependences which are discussed below
8383
```
8484
pip install pymanopt autograd
8585
```
86-
* **ot.gpu** (GPU accelerated OT) depends on cudamat that have to be installed with:
87-
```
88-
git clone https://github.com/cudamat/cudamat.git
89-
cd cudamat
90-
python setup.py install --user # for user install (no root)
91-
```
86+
* **ot.gpu** (GPU accelerated OT) depends on cupy that have to be installed following instructions on [this page](https://docs-cupy.chainer.org/en/stable/install.html).
87+
9288

9389
obviously you need CUDA installed and a compatible GPU.
9490

docs/source/all.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ ot.da
4848

4949
.. automodule:: ot.da
5050
:members:
51+
52+
ot.gpu
53+
--------
54+
55+
.. automodule:: ot.gpu
56+
:members:
5157

5258
ot.dr
5359
--------

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ class Mock(MagicMock):
3131
@classmethod
3232
def __getattr__(cls, name):
3333
return MagicMock()
34-
MOCK_MODULES = ['ot.lp.emd_wrap','autograd','pymanopt','cudamat','autograd.numpy','pymanopt.manifolds','pymanopt.solvers']
34+
MOCK_MODULES = ['ot.lp.emd_wrap','autograd','pymanopt','cupy','autograd.numpy','pymanopt.manifolds','pymanopt.solvers']
3535
# 'autograd.numpy','pymanopt.manifolds','pymanopt.solvers',
3636
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
3737
# !!!!

ot/gpu/__init__.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,36 @@
11
# -*- coding: utf-8 -*-
2+
"""
3+
4+
This module provides GPU implementation for several OT solvers and utility
5+
functions. The GPU backend in handled by `cupy
6+
<https://cupy.chainer.org/>`_.
7+
8+
By default, the functions in this module accept and return numpy arrays
9+
in order to proide drop-in replacement for the other POT function but
10+
the transfer between CPU en GPU comes with a significant overhead.
11+
12+
In order to get the best erformances, we recommend to given only cupy
13+
arrays to the functions and desactivate the conversion to numpy of the
14+
result of the function with parameter ``to_numpy=False``.
15+
16+
17+
18+
19+
"""
220

321
from . import bregman
422
from . import da
523
from .bregman import sinkhorn
24+
from .da import sinkhorn_lpl1_mm
25+
26+
from . import utils
27+
from .utils import dist, to_gpu, to_np
28+
629

730
# Author: Remi Flamary <[email protected]>
831
# Leo Gautheron <https://github.com/aje>
932
#
1033
# License: MIT License
1134

12-
__all__ = ["bregman", "da", "sinkhorn"]
35+
__all__ = ["utils", "dist", "sinkhorn",
36+
"sinkhorn_lpl1_mm", 'bregman', 'da', 'to_gpu', 'to_np']

ot/gpu/bregman.py

Lines changed: 92 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,18 @@
88
#
99
# License: MIT License
1010

11-
import numpy as np
12-
import cudamat
11+
import cupy as np # np used for matrix computation
12+
import cupy as cp # cp used for cupy specific operations
13+
from . import utils
1314

1415

15-
def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
16-
log=False, returnAsGPU=False):
17-
r"""
18-
Solve the entropic regularization optimal transport problem on GPU
16+
def sinkhorn_knopp(a, b, M, reg, numItermax=1000, stopThr=1e-9,
17+
verbose=False, log=False, to_numpy=True, **kwargs):
18+
"""
19+
Solve the entropic regularization optimal transport on GPU
20+
21+
If the input matrix are in numpy format, they will be uploaded to the
22+
GPU first which can incur significant time overhead.
1923
2024
The function solves the following optimization problem:
2125
@@ -40,9 +44,10 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
4044
----------
4145
a : np.ndarray (ns,)
4246
samples weights in the source domain
43-
b : np.ndarray (nt,)
44-
samples in the target domain
45-
M_GPU : cudamat.CUDAMatrix (ns,nt)
47+
b : np.ndarray (nt,) or np.ndarray (nt,nbb)
48+
samples in the target domain, compute sinkhorn with multiple targets
49+
and fixed M if b is a matrix (return OT loss + dual variables in log)
50+
M : np.ndarray (ns,nt)
4651
loss matrix
4752
reg : float
4853
Regularization term >0
@@ -54,8 +59,9 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
5459
Print information along iterations
5560
log : bool, optional
5661
record log if True
57-
returnAsGPU : bool, optional
58-
return the OT matrix as a cudamat.CUDAMatrix
62+
to_numpy : boolean, optional (default True)
63+
If true convert back the GPU array result to numpy format.
64+
5965
6066
Returns
6167
-------
@@ -88,60 +94,78 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
8894
ot.optim.cg : General regularized OT
8995
9096
"""
97+
98+
a = cp.asarray(a)
99+
b = cp.asarray(b)
100+
M = cp.asarray(M)
101+
102+
if len(a) == 0:
103+
a = np.ones((M.shape[0],)) / M.shape[0]
104+
if len(b) == 0:
105+
b = np.ones((M.shape[1],)) / M.shape[1]
106+
91107
# init data
92108
Nini = len(a)
93109
Nfin = len(b)
94110

111+
if len(b.shape) > 1:
112+
nbb = b.shape[1]
113+
else:
114+
nbb = 0
115+
95116
if log:
96117
log = {'err': []}
97118

98119
# we assume that no distances are null except those of the diagonal of
99120
# distances
100-
u = (np.ones(Nini) / Nini).reshape((Nini, 1))
101-
u_GPU = cudamat.CUDAMatrix(u)
102-
a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1)))
103-
ones_GPU = cudamat.empty(u_GPU.shape).assign(1)
104-
v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1))
105-
v_GPU = cudamat.CUDAMatrix(v)
106-
b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1)))
107-
108-
M_GPU.divide(-reg)
121+
if nbb:
122+
u = np.ones((Nini, nbb)) / Nini
123+
v = np.ones((Nfin, nbb)) / Nfin
124+
else:
125+
u = np.ones(Nini) / Nini
126+
v = np.ones(Nfin) / Nfin
109127

110-
K_GPU = cudamat.exp(M_GPU)
128+
# print(reg)
111129

112-
ones_GPU.divide(a_GPU, target=a_GPU)
113-
Kp_GPU = cudamat.empty(K_GPU.shape)
114-
K_GPU.mult_by_col(a_GPU, target=Kp_GPU)
130+
# Next 3 lines equivalent to K= np.exp(-M/reg), but faster to compute
131+
K = np.empty(M.shape, dtype=M.dtype)
132+
np.divide(M, -reg, out=K)
133+
np.exp(K, out=K)
115134

116-
tmp_GPU = cudamat.empty(K_GPU.shape)
135+
# print(np.min(K))
136+
tmp2 = np.empty(b.shape, dtype=M.dtype)
117137

138+
Kp = (1 / a).reshape(-1, 1) * K
118139
cpt = 0
119140
err = 1
120141
while (err > stopThr and cpt < numItermax):
121-
uprev_GPU = u_GPU.copy()
122-
vprev_GPU = v_GPU.copy()
142+
uprev = u
143+
vprev = v
123144

124-
KtransposeU_GPU = K_GPU.transpose().dot(u_GPU)
125-
b_GPU.divide(KtransposeU_GPU, target=v_GPU)
126-
ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU)
145+
KtransposeU = np.dot(K.T, u)
146+
v = np.divide(b, KtransposeU)
147+
u = 1. / np.dot(Kp, v)
127148

128-
if (np.any(KtransposeU_GPU.asarray() == 0) or
129-
not u_GPU.allfinite() or not v_GPU.allfinite()):
149+
if (np.any(KtransposeU == 0) or
150+
np.any(np.isnan(u)) or np.any(np.isnan(v)) or
151+
np.any(np.isinf(u)) or np.any(np.isinf(v))):
130152
# we have reached the machine precision
131153
# come back to previous solution and quit loop
132154
print('Warning: numerical errors at iteration', cpt)
133-
u_GPU = uprev_GPU.copy()
134-
v_GPU = vprev_GPU.copy()
155+
u = uprev
156+
v = vprev
135157
break
136158
if cpt % 10 == 0:
137159
# we can speed up the process by checking for the error only all
138160
# the 10th iterations
139-
K_GPU.mult_by_col(u_GPU, target=tmp_GPU)
140-
tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU)
141-
142-
bcopy_GPU = b_GPU.copy().transpose()
143-
bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1)
144-
err = bcopy_GPU.euclid_norm()**2
161+
if nbb:
162+
err = np.sum((u - uprev)**2) / np.sum((u)**2) + \
163+
np.sum((v - vprev)**2) / np.sum((v)**2)
164+
else:
165+
# compute right marginal tmp2= (diag(u)Kdiag(v))^T1
166+
tmp2 = np.sum(u[:, None] * K * v[None, :], 0)
167+
#tmp2=np.einsum('i,ij,j->j', u, K, v)
168+
err = np.linalg.norm(tmp2 - b)**2 # violation of marginal
145169
if log:
146170
log['err'].append(err)
147171

@@ -150,20 +174,32 @@ def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
150174
print(
151175
'{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
152176
print('{:5d}|{:8e}|'.format(cpt, err))
153-
cpt += 1
154-
if log:
155-
log['u'] = u_GPU.asarray()
156-
log['v'] = v_GPU.asarray()
157-
158-
K_GPU.mult_by_col(u_GPU, target=K_GPU)
159-
K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU)
160-
161-
if returnAsGPU:
162-
res = K_GPU
163-
else:
164-
res = K_GPU.asarray()
165-
177+
cpt = cpt + 1
166178
if log:
167-
return res, log
168-
else:
169-
return res
179+
log['u'] = u
180+
log['v'] = v
181+
182+
if nbb: # return only loss
183+
#res = np.einsum('ik,ij,jk,ij->k', u, K, v, M) (explodes cupy memory)
184+
res = np.empty(nbb)
185+
for i in range(nbb):
186+
res[i] = np.sum(u[:, None, i] * (K * M) * v[None, :, i])
187+
if to_numpy:
188+
res = utils.to_np(res)
189+
if log:
190+
return res, log
191+
else:
192+
return res
193+
194+
else: # return OT matrix
195+
res = u.reshape((-1, 1)) * K * v.reshape((1, -1))
196+
if to_numpy:
197+
res = utils.to_np(res)
198+
if log:
199+
return res, log
200+
else:
201+
return res
202+
203+
204+
# define sinkhorn as sinkhorn_knopp
205+
sinkhorn = sinkhorn_knopp

0 commit comments

Comments
 (0)