Skip to content

Commit ad101c9

Browse files
committed
Merge pull request #15139 from JuliaLang/jr/alignment
Request 64-byte aligned memory instead of 16-byte aligned memory for large objects
2 parents 1c0bbb2 + 8ee170f commit ad101c9

File tree

4 files changed

+48
-40
lines changed

4 files changed

+48
-40
lines changed

src/array.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ extern "C" {
1818

1919
#define JL_ARRAY_ALIGN(jl_value, nbytes) LLT_ALIGN(jl_value, nbytes)
2020

21-
2221
// array constructors ---------------------------------------------------------
2322

2423
static inline int store_unboxed(jl_value_t *el_type)
@@ -74,13 +73,13 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims,
7473
}
7574

7675
int ndimwords = jl_array_ndimwords(ndims);
77-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
76+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
7877
if (tot <= ARRAY_INLINE_NBYTES) {
7978
if (isunboxed && elsz >= 4)
80-
tsz = JL_ARRAY_ALIGN(tsz, 16); // align data area 16
79+
tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align data area
8180
size_t doffs = tsz;
8281
tsz += tot;
83-
tsz = JL_ARRAY_ALIGN(tsz, 16); // align whole object 16
82+
tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align whole object
8483
a = (jl_array_t*)jl_gc_allocobj(tsz);
8584
jl_set_typeof(a, atype);
8685
a->flags.how = 0;
@@ -90,7 +89,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims,
9089
}
9190
}
9291
else {
93-
tsz = JL_ARRAY_ALIGN(tsz, 16); // align whole object 16
92+
tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object
9493
a = (jl_array_t*)jl_gc_allocobj(tsz);
9594
JL_GC_PUSH1(&a);
9695
jl_set_typeof(a, atype);
@@ -157,7 +156,7 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data,
157156
size_t ndims = jl_nfields(dims);
158157

159158
int ndimwords = jl_array_ndimwords(ndims);
160-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), 16);
159+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT);
161160
a = (jl_array_t*)jl_gc_allocobj(tsz);
162161
jl_set_typeof(a, atype);
163162
a->flags.pooled = tsz <= GC_MAX_SZCLASS;
@@ -233,7 +232,7 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data,
233232
elsz = sizeof(void*);
234233

235234
int ndimwords = jl_array_ndimwords(1);
236-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
235+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
237236
a = (jl_array_t*)jl_gc_allocobj(tsz);
238237
jl_set_typeof(a, atype);
239238
a->flags.pooled = tsz <= GC_MAX_SZCLASS;
@@ -284,7 +283,7 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data,
284283
elsz = sizeof(void*);
285284

286285
int ndimwords = jl_array_ndimwords(ndims);
287-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
286+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
288287
a = (jl_array_t*)jl_gc_allocobj(tsz);
289288
jl_set_typeof(a, atype);
290289
a->flags.pooled = tsz <= GC_MAX_SZCLASS;

src/gc.c

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,13 @@ typedef struct _bigval_t {
134134
size_t sz;
135135
uintptr_t age : 2;
136136
};
137+
#ifdef _P64 // Add padding so that char data[] below is 64-byte aligned
138+
// (8 pointers of 8 bytes each) - (4 other pointers in struct)
139+
void *_padding[8 - 4];
140+
#else
141+
// (16 pointers of 4 bytes each) - (4 other pointers in struct)
142+
void *_padding[16 - 4];
143+
#endif
137144
//struct buff_t <>;
138145
union {
139146
uintptr_t header;
@@ -145,7 +152,7 @@ typedef struct _bigval_t {
145152
#if !defined(_COMPILER_MICROSOFT_)
146153
int _dummy[0];
147154
#endif
148-
// must be 16-aligned here, in 32 & 64b
155+
// must be 64-byte aligned here, in 32 & 64 bit modes
149156
char data[];
150157
} bigval_t;
151158

@@ -170,7 +177,7 @@ typedef struct _pool_t {
170177

171178
#define GC_PAGE_LG2 14 // log2(size of a page)
172179
#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
173-
#define GC_PAGE_OFFSET (16 - (sizeof_jl_taggedvalue_t % 16))
180+
#define GC_PAGE_OFFSET (JL_SMALL_BYTE_ALIGNMENT - (sizeof_jl_taggedvalue_t % JL_SMALL_BYTE_ALIGNMENT))
174181

175182
// pool page metadata
176183
typedef struct _gcpage_t {
@@ -440,15 +447,8 @@ static int jl_gc_finalizers_inhibited; // don't run finalizers during codegen #1
440447

441448
// malloc wrappers, aligned allocation
442449

443-
#if defined(_P64) || defined(__APPLE__)
444-
#define malloc_a16(sz) malloc(sz)
445-
#define realloc_a16(p, sz, oldsz) realloc((p), (sz))
446-
#define free_a16(p) free(p)
447-
#else
448-
#define malloc_a16(sz) jl_malloc_aligned(sz, 16)
449-
#define realloc_a16(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, 16)
450-
#define free_a16(p) jl_free_aligned(p)
451-
#endif
450+
#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT)
451+
#define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT)
452452

453453
static void schedule_finalization(void *o, void *f)
454454
{
@@ -973,10 +973,10 @@ static NOINLINE void *alloc_big(size_t sz)
973973
{
974974
maybe_collect();
975975
size_t offs = offsetof(bigval_t, header);
976-
size_t allocsz = LLT_ALIGN(sz + offs, 16);
976+
size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT);
977977
if (allocsz < sz) // overflow in adding offs, size was "negative"
978978
jl_throw(jl_memory_exception);
979-
bigval_t *v = (bigval_t*)malloc_a16(allocsz);
979+
bigval_t *v = (bigval_t*)malloc_cache_align(allocsz);
980980
if (v == NULL)
981981
jl_throw(jl_memory_exception);
982982
jl_atomic_fetch_add(&allocd_bytes, allocsz);
@@ -1036,7 +1036,7 @@ static bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv)
10361036
#ifdef MEMDEBUG
10371037
memset(v, 0xbb, v->sz&~3);
10381038
#endif
1039-
free_a16(v);
1039+
jl_free_aligned(v);
10401040
big_freed++;
10411041
}
10421042
big_total++;
@@ -1103,7 +1103,7 @@ static void jl_gc_free_array(jl_array_t *a)
11031103
if (a->flags.how == 2) {
11041104
char *d = (char*)a->data - a->offset*a->elsize;
11051105
if (a->flags.isaligned)
1106-
free_a16(d);
1106+
jl_free_aligned(d);
11071107
else
11081108
free(d);
11091109
freed_bytes += array_nbytes(a);
@@ -2397,7 +2397,7 @@ void *reallocb(void *b, size_t sz)
23972397
if (allocsz < sz) // overflow in adding offs, size was "negative"
23982398
jl_throw(jl_memory_exception);
23992399
bigval_t *bv = bigval_header(buff);
2400-
bv = (bigval_t*)realloc_a16(bv, allocsz, bv->sz&~3);
2400+
bv = (bigval_t*)realloc_cache_align(bv, allocsz, bv->sz&~3);
24012401
if (bv == NULL)
24022402
jl_throw(jl_memory_exception);
24032403
return &bv->data[0];
@@ -2436,7 +2436,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void)
24362436

24372437
JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
24382438
{
2439-
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*), 16);
2439+
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT);
24402440
void *tag = NULL;
24412441
#ifdef MEMDEBUG
24422442
tag = alloc_big(sz);
@@ -2449,7 +2449,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
24492449

24502450
JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
24512451
{
2452-
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 2, 16);
2452+
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 2, JL_SMALL_BYTE_ALIGNMENT);
24532453
void *tag = NULL;
24542454
#ifdef MEMDEBUG
24552455
tag = alloc_big(sz);
@@ -2462,7 +2462,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
24622462

24632463
JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void)
24642464
{
2465-
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 3, 16);
2465+
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 3, JL_SMALL_BYTE_ALIGNMENT);
24662466
void *tag = NULL;
24672467
#ifdef MEMDEBUG
24682468
tag = alloc_big(sz);
@@ -2509,7 +2509,7 @@ jl_thread_heap_t *jl_mk_thread_heap(void)
25092509
#ifdef JULIA_ENABLE_THREADING
25102510
// Cache-aligned malloc
25112511
jl_thread_heap =
2512-
(jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), 64);
2512+
(jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), JL_CACHE_BYTE_ALIGNMENT);
25132513
#endif
25142514
FOR_CURRENT_HEAP () {
25152515
const int *szc = sizeclasses;
@@ -2673,6 +2673,7 @@ static void big_obj_stats(void)
26732673

26742674
JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
26752675
{
2676+
sz += JL_SMALL_BYTE_ALIGNMENT;
26762677
maybe_collect();
26772678
allocd_bytes += sz;
26782679
gc_num.malloc++;
@@ -2684,6 +2685,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
26842685

26852686
JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
26862687
{
2688+
nm += JL_SMALL_BYTE_ALIGNMENT;
26872689
maybe_collect();
26882690
allocd_bytes += nm*sz;
26892691
gc_num.malloc++;
@@ -2696,15 +2698,15 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
26962698
JL_DLLEXPORT void jl_gc_counted_free(void *p, size_t sz)
26972699
{
26982700
free(p);
2699-
freed_bytes += sz;
2701+
freed_bytes += sz + JL_SMALL_BYTE_ALIGNMENT;
27002702
gc_num.freecall++;
27012703
}
27022704

2703-
JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old,
2704-
size_t sz)
2705+
JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
27052706
{
2707+
old += JL_SMALL_BYTE_ALIGNMENT;
2708+
sz += JL_SMALL_BYTE_ALIGNMENT;
27062709
maybe_collect();
2707-
27082710
if (sz < old)
27092711
freed_bytes += (old - sz);
27102712
else
@@ -2718,7 +2720,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old,
27182720

27192721
JL_DLLEXPORT void *jl_malloc(size_t sz)
27202722
{
2721-
int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + 16);
2723+
int64_t *p = (int64_t *)jl_gc_counted_malloc(sz);
27222724
p[0] = sz;
27232725
return (void *)(p + 2);
27242726
}
@@ -2727,7 +2729,7 @@ JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
27272729
{
27282730
int64_t *p;
27292731
size_t nmsz = nm*sz;
2730-
p = (int64_t *)jl_gc_counted_calloc(nmsz + 16, 1);
2732+
p = (int64_t *)jl_gc_counted_calloc(nmsz, 1);
27312733
p[0] = nmsz;
27322734
return (void *)(p + 2);
27332735
}
@@ -2736,7 +2738,7 @@ JL_DLLEXPORT void jl_free(void *p)
27362738
{
27372739
int64_t *pp = (int64_t *)p - 2;
27382740
size_t sz = pp[0];
2739-
jl_gc_counted_free(pp, sz + 16);
2741+
jl_gc_counted_free(pp, sz);
27402742
}
27412743

27422744
JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
@@ -2751,20 +2753,20 @@ JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
27512753
pp = (int64_t *)p - 2;
27522754
szold = pp[0];
27532755
}
2754-
int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold + 16, sz + 16);
2756+
int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz);
27552757
pnew[0] = sz;
27562758
return (void *)(pnew + 2);
27572759
}
27582760

27592761
JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
27602762
{
27612763
maybe_collect();
2762-
size_t allocsz = LLT_ALIGN(sz, 16);
2764+
size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
27632765
if (allocsz < sz) // overflow in adding offs, size was "negative"
27642766
jl_throw(jl_memory_exception);
27652767
allocd_bytes += allocsz;
27662768
gc_num.malloc++;
2767-
void *b = malloc_a16(allocsz);
2769+
void *b = malloc_cache_align(allocsz);
27682770
if (b == NULL)
27692771
jl_throw(jl_memory_exception);
27702772
return b;
@@ -2775,7 +2777,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
27752777
{
27762778
maybe_collect();
27772779

2778-
size_t allocsz = LLT_ALIGN(sz, 16);
2780+
size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
27792781
if (allocsz < sz) // overflow in adding offs, size was "negative"
27802782
jl_throw(jl_memory_exception);
27812783

@@ -2791,7 +2793,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
27912793

27922794
void *b;
27932795
if (isaligned)
2794-
b = realloc_a16(d, allocsz, oldsz);
2796+
b = realloc_cache_align(d, allocsz, oldsz);
27952797
else
27962798
b = realloc(d, allocsz);
27972799
if (b == NULL)

src/julia_internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,9 @@ STATIC_INLINE void jl_free_aligned(void *p)
501501
}
502502
#endif
503503

504+
#define JL_SMALL_BYTE_ALIGNMENT 16
505+
#define JL_CACHE_BYTE_ALIGNMENT 64
506+
504507
#ifdef __cplusplus
505508
}
506509
#endif

test/core.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3876,3 +3876,7 @@ module Test15264
38763876
mod1{T}(x::T) = x < 1 ? x : mod1(x-1)
38773877
end
38783878
@test Test15264.mod1 !== Base.mod1
3879+
3880+
3881+
# check that medium-sized array is 64-byte aligned (#15139)
3882+
@test Int(pointer(Vector{Float64}(1024))) % 64 == 0

0 commit comments

Comments
 (0)