Skip to content

Commit d0e154d

Browse files
committed
Improve CUDA array header allocation
1 parent 666157b commit d0e154d

File tree

2 files changed

+105
-36
lines changed

2 files changed

+105
-36
lines changed

numba/cuda/cudadrv/devicearray.py

+38-28
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import warnings
88
import math
99
import numpy as np
10-
from .ndarray import ndarray_device_allocate_head, ndarray_populate_head
10+
from .ndarray import (ndarray_populate_head, ArrayHeaderManager)
1111
from . import driver as _driver
1212
from . import devices
1313
from numba import dummyarray
@@ -54,15 +54,23 @@ class DeviceNDArrayBase(object):
5454
def __init__(self, shape, strides, dtype, stream=0, writeback=None,
5555
gpu_head=None, gpu_data=None):
5656
"""
57-
Arguments
58-
59-
shape: array shape.
60-
strides: array strides.
61-
dtype: data type as numpy.dtype.
62-
stream: cuda stream.
63-
writeback: Deprecated.
64-
gpu_head: user provided device memory for the ndarray head structure
65-
gpu_data: user provided device memory for the ndarray data buffer
57+
Args
58+
----
59+
60+
shape
61+
array shape.
62+
strides
63+
array strides.
64+
dtype
65+
data type as numpy.dtype.
66+
stream
67+
cuda stream.
68+
writeback
69+
Deprecated.
70+
gpu_head
71+
user provided device memory for the ndarray head structure
72+
gpu_data
73+
user provided device memory for the ndarray data buffer
6674
"""
6775
if isinstance(shape, (int, long)):
6876
shape = (shape,)
@@ -86,27 +94,22 @@ def __init__(self, shape, strides, dtype, stream=0, writeback=None,
8694
else:
8795
self.alloc_size = _driver.device_memory_size(gpu_data)
8896

97+
self.gpu_mem = ArrayHeaderManager(devices.get_context())
98+
8999
if gpu_head is None:
90-
gpu_head = ndarray_device_allocate_head(self.ndim)
100+
gpu_head = self.gpu_mem.allocate(self.ndim)
91101
ndarray_populate_head(gpu_head, gpu_data, self.shape,
92102
self.strides, stream=stream)
93103
self.gpu_head = gpu_head
94104
self.gpu_data = gpu_data
95105

96106
self.__writeback = writeback # should deprecate the use of this
97107

98-
# define the array interface to work with numpy
99-
#
100-
# XXX: problem with data being accessed.
101-
# is NULL pointer alright?
102-
#
103-
# self.__array_interface__ = {
104-
# 'shape' : self.shape,
105-
# 'typestr' : self.dtype.str,
106-
# 'data' : (0, True),
107-
# 'version' : 3,
108-
# }
109-
108+
def __del__(self):
    # Best-effort: hand the header slot back to the per-context pool.
    # Failures are deliberately ignored -- the instance may be partially
    # constructed (gpu_mem/gpu_head missing) or the pool may already be
    # torn down at interpreter shutdown.  Catch Exception rather than
    # using a bare except so SystemExit/KeyboardInterrupt propagate.
    try:
        self.gpu_mem.free(self.gpu_head)
    except Exception:
        pass
110113

111114
@property
112115
def device_ctypes_pointer(self):
@@ -183,11 +186,11 @@ def split(self, section, stream=0):
183186
end = min(begin + section, self.size)
184187
shape = (end - begin,)
185188
gpu_data = self.gpu_data.view(begin * itemsize, end * itemsize)
186-
gpu_head = ndarray_device_allocate_head(1)
187-
ndarray_populate_head(gpu_head, gpu_data, shape, strides,
188-
stream=stream)
189+
# gpu_head = _allocate_head(1)
190+
# ndarray_populate_head(gpu_head, gpu_data, shape, strides,
191+
# stream=stream)
189192
yield DeviceNDArray(shape, strides, dtype=self.dtype, stream=stream,
190-
gpu_head=gpu_head, gpu_data=gpu_data)
193+
gpu_data=gpu_data)
191194

192195
def as_cuda_arg(self):
193196
"""Returns a device memory object that is used as the argument.
@@ -251,14 +254,21 @@ def __getitem__(self, item):
251254

252255
class MappedNDArray(DeviceNDArrayBase, np.ndarray):
253256
def device_setup(self, gpu_data, stream=0):
    # Bind this mapped array to device memory.
    #
    # Args
    # ----
    # gpu_data
    #     device memory holding the array's data buffer.
    # stream
    #     CUDA stream used when populating the header (0 = default).

    # Per-context header pool; ArrayHeaderManager caches one instance
    # per context handle, so repeated construction is cheap.
    self.gpu_mem = ArrayHeaderManager(devices.get_context())

    # Allocate device memory for the array header and fill it with this
    # array's shape/strides and the data pointer.
    gpu_head = self.gpu_mem.allocate(self.ndim)
    ndarray_populate_head(gpu_head, gpu_data, self.shape,
                          self.strides, stream=stream)

    self.gpu_data = gpu_data
    self.gpu_head = gpu_head
261265

266+
def __del__(self):
    # Best-effort: hand the header slot back to the per-context pool.
    # Failures are deliberately ignored -- the instance may be partially
    # constructed (gpu_mem/gpu_head missing) or the pool may already be
    # torn down at interpreter shutdown.  Catch Exception rather than
    # using a bare except so SystemExit/KeyboardInterrupt propagate.
    try:
        self.gpu_mem.free(self.gpu_head)
    except Exception:
        pass
271+
262272

263273
def from_array_like(ary, stream=0, gpu_head=None, gpu_data=None):
264274
"Create a DeviceNDArray object that is like ary."

numba/cuda/cudadrv/ndarray.py

+67-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,70 @@
33
from . import devices, driver
44

55

6+
class ArrayHeaderManager(object):
    """Per-context pool of device memory for CUDA array headers.

    One large device allocation is carved into ``maxsize`` fixed-size
    slots of ``elemsize`` bytes, so the small, frequent header
    allocations can be recycled instead of going through the driver
    allocator every time.  This keeps asynchronous streams intact and
    reduces device-memory fragmentation.

    Requests that do not fit in a slot, or that arrive after the pool is
    exhausted, fall back transparently to a regular device allocation.

    Instances are cached per CUDA context: constructing the class with a
    context that already has a manager returns the existing one.
    """

    # One manager per context, keyed by the context handle value.
    context_map = {}

    # Number of preallocated header slots.
    maxsize = 2 ** 10

    # Slot size in bytes:
    # 4 (ndim) * 8 (sizeof intp) * 2 (shape + strides) + 8 (data ptr).
    # NOTE(review): allocate() falls back whenever sizeof >= elemsize,
    # so a header of exactly 72 bytes is *not* pooled -- confirm whether
    # the intent was "up to 4D" as the wording above suggests.
    elemsize = 72

    def __new__(cls, context):
        key = context.handle.value
        try:
            return cls.context_map[key]
        except KeyError:
            pass
        inst = object.__new__(cls)
        inst.init(context)
        cls.context_map[key] = inst
        return inst

    def init(self, context):
        # Deliberately not __init__: object.__init__ would re-run on
        # every lookup of a cached instance, whereas this runs exactly
        # once per context.
        self.context = context
        self.data = context.memalloc(self.elemsize * self.maxsize)
        # Free slots, each a fixed-size view into the big allocation.
        self.queue = [self.data.view(n * self.elemsize,
                                     (n + 1) * self.elemsize)
                      for n in range(self.maxsize)]
        # Slots currently handed out; lets free() tell pooled memory
        # apart from fallback allocations.
        self.allocated = set()

    def allocate(self, nd):
        """Return device memory able to hold an *nd*-D array header."""
        headsize = ctypes.sizeof(make_array_ctype(nd))
        if headsize < self.elemsize and self.queue:
            mem = self.queue.pop()
            self.allocated.add(mem)
            return mem
        # Header too big for a slot, or pool exhausted: plain allocation.
        return _allocate_head(nd)

    def free(self, mem):
        """Recycle *mem* if it came from the pool; otherwise do nothing."""
        try:
            self.allocated.remove(mem)
        except KeyError:
            return
        self.queue.append(mem)

    def __repr__(self):
        return "<cuda managed memory %s >" % (self.context.device,)
68+
69+
670
def make_array_ctype(ndim):
771
c_intp = ctypes.c_ssize_t
872

@@ -14,8 +78,9 @@ class c_array(ctypes.Structure):
1478
return c_array
1579

1680

17-
def ndarray_device_allocate_head(nd):
18-
"Allocate the metadata structure"
81+
def _allocate_head(nd):
    """Allocate device memory for the header of an *nd*-dimensional array."""
    header_type = make_array_ctype(nd)
    nbytes = ctypes.sizeof(header_type)
    return devices.get_context().memalloc(nbytes)
@@ -28,12 +93,6 @@ def ndarray_device_allocate_data(ary):
2893
return gpu_data
2994

3095

31-
def ndarray_device_transfer_data(ary, gpu_data, stream=0):
32-
size = driver.host_memory_size(ary)
33-
# transfer data
34-
driver.host_to_device(gpu_data, ary, size, stream=stream)
35-
36-
3796
def ndarray_populate_head(gpu_head, gpu_data, shape, strides, stream=0):
3897
nd = len(shape)
3998
assert nd > 0, "0 or negative dimension"

0 commit comments

Comments
 (0)