Skip to content

Commit bcd8c2a

Browse files
committed
Speed up putImageData for RGBA canvases
1 parent 0d9ca88 commit bcd8c2a

File tree

3 files changed

+107
-29
lines changed

3 files changed

+107
-29
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ project adheres to [Semantic Versioning](http://semver.org/).
1212
* Switch prebuilds to GitHub actions in the Automattic/node-canvas repository.
1313
Previously these were in the [node-gfx/node-canvas-prebuilt](https://github.com/node-gfx/node-canvas-prebuilt)
1414
and triggered manually.
15+
* Speed up `putImageData` for RGBA32 canvases.
1516
### Added
1617
* Export `rsvgVersion`.
1718
### Fixed

benchmarks/run.js

+23-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* milliseconds to complete.
55
*/
66

7-
var createCanvas = require('../').createCanvas
7+
var { createCanvas, ImageData } = require('../')
88
var canvas = createCanvas(200, 200)
99
var largeCanvas = createCanvas(1000, 1000)
1010
var ctx = canvas.getContext('2d')
@@ -64,6 +64,28 @@ function done (benchmark, times, start, isAsync) {
6464

6565
// node-canvas
6666

67+
const id0 = new ImageData(200, 200)
68+
69+
bm('putImageData, all a=0', function () {
70+
ctx.putImageData(id0, 0, 0)
71+
})
72+
73+
const id255 = new ImageData(200, 200)
74+
id255.data.fill(0xFF)
75+
76+
bm('putImageData, all a=0xFF', function () {
77+
ctx.putImageData(id255, 0, 0)
78+
})
79+
80+
const idRand = new ImageData(200, 200)
81+
for (let i = 0; i < idRand.data.length; i++) {
82+
idRand.data[i] = 255 * Math.random()
83+
}
84+
85+
bm('putImageData, mixed a', function () {
86+
ctx.putImageData(idRand, 0, 0)
87+
})
88+
6789
bm('fillStyle= name', function () {
6890
ctx.fillStyle = 'transparent'
6991
})

src/CanvasRenderingContext2d.cc

+83-28
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,29 @@
2121

2222
using namespace v8;
2323

24-
// Windows doesn't support the C99 names for these
2524
#ifdef _MSC_VER
26-
#define isnan(x) _isnan(x)
27-
#define isinf(x) (!_finite(x))
25+
// Windows doesn't support the C99 names for these. TODO unnecessary,
26+
// should be using std::isnan.
27+
# define isnan(x) _isnan(x)
28+
# define isinf(x) (!_finite(x))
29+
# include <intrin.h>
30+
# define bswap32 _byteswap_ulong
31+
#else
32+
# ifdef __x86_64__
33+
# include <x86intrin.h>
34+
# endif
35+
# define bswap32 __builtin_bswap32
2836
#endif
2937

38+
static inline uint32_t rotr(uint32_t n, unsigned int c) { // Rotate n right by c bits; c is reduced modulo the bit width of n.
39+
// GCC has no portable _rotr intrinsic, so rely on idiom recognition. Works
40+
// for all supported versions of MSVC, GCC x86, GCC ARM, Clang.
41+
// https://stackoverflow.com/a/776523/1218408
42+
const unsigned int mask = CHAR_BIT * sizeof(n) - 1; // bit width of n minus one (31 when CHAR_BIT is 8)
43+
c &= mask; // keep the right-shift count strictly below the bit width
44+
return (n >> c) | (n << ((~c + 1) & mask)); // (~c + 1) is unsigned negation of c; masking it keeps the left shift below the bit width, so c == 0 never shifts by the full width (undefined behavior)
45+
}
46+
3047
#ifndef isnan
3148
#define isnan(x) std::isnan(x)
3249
#define isinf(x) std::isinf(x)
@@ -852,32 +869,70 @@ NAN_METHOD(Context2d::PutImageData) {
852869
for (int y = 0; y < rows; ++y) {
853870
uint8_t *dstRow = dst;
854871
uint8_t *srcRow = src;
855-
for (int x = 0; x < cols; ++x) {
856-
// rgba
857-
uint8_t r = *srcRow++;
858-
uint8_t g = *srcRow++;
859-
uint8_t b = *srcRow++;
860-
uint8_t a = *srcRow++;
872+
#if defined(__x86_64__) || defined(_M_X64)
873+
int x = 0;
874+
for (; x < cols - 1; x += 2) { // Two columns at a time
875+
// Fast path if both alphas are 0.
876+
uint64_t px64;
877+
memcpy(&px64, srcRow, 8);
878+
const uint64_t aMask = 0xFF000000'FF000000;
879+
const uint64_t aOnly = px64 & aMask;
880+
if (aOnly == 0) {
881+
memset(dstRow, 0, 8);
882+
dstRow += 8;
883+
srcRow += 8;
884+
continue;
885+
}
861886

862-
// argb
863-
// performance optimization: fully transparent/opaque pixels can be
864-
// processed more efficiently.
887+
__m128i px;
888+
memcpy(&px, srcRow, 8); // gcc doesn't define _mm_loadu_si64
889+
px = _mm_unpacklo_epi8(px, _mm_setzero_si128());
890+
// rgba -> bgra
891+
px = _mm_shufflelo_epi16(px, 0b11000110);
892+
px = _mm_shufflehi_epi16(px, 0b11000110);
893+
894+
// Fast path if both alphas are 255.
895+
if (aOnly != aMask) {
896+
// broadcast alpha
897+
__m128i av = _mm_shufflelo_epi16(px, 0b11111111);
898+
av = _mm_shufflehi_epi16(av, 0b11111111);
899+
// Multiply by alpha.
900+
// Set alpha channel multiplier to 255 to undo upcoming division by 255
901+
const __m128i a255 = _mm_set_epi16(0xFF, 0, 0, 0, 0xFF, 0, 0, 0);
902+
av = _mm_or_si128(av, a255);
903+
px = _mm_mullo_epi16(px, av);
904+
// divide by 255
905+
px = _mm_mulhi_epu16(px, _mm_set1_epi16(0x8081));
906+
px = _mm_srli_epi16(px, 7);
907+
}
908+
909+
// pack int16 to int8
910+
px = _mm_packus_epi16(px, px);
911+
memcpy(dstRow, &px, 8);
912+
dstRow += 8;
913+
srcRow += 8;
914+
}
915+
if (cols & 1) {
916+
#else
917+
for (int x = 0; x < cols; x++) {
918+
#endif
919+
uint32_t c;
920+
memcpy(&c, srcRow, 4); // rgba (LE)
921+
srcRow += 4;
922+
uint32_t a = c >> 24;
865923
if (a == 0) {
866-
*dstRow++ = 0;
867-
*dstRow++ = 0;
868-
*dstRow++ = 0;
869-
*dstRow++ = 0;
870-
} else if (a == 255) {
871-
*dstRow++ = b;
872-
*dstRow++ = g;
873-
*dstRow++ = r;
874-
*dstRow++ = a;
924+
uint32_t zero = 0;
925+
memcpy(dstRow, &zero, 4);
926+
} else if (a == 255) { // rgba (LE)
927+
c = bswap32(c); // abgr
928+
c = rotr(c, 8); // bgra
929+
memcpy(dstRow, &c, 4);
875930
} else {
876-
float alpha = (float)a / 255;
877-
*dstRow++ = b * alpha;
878-
*dstRow++ = g * alpha;
879-
*dstRow++ = r * alpha;
880-
*dstRow++ = a;
931+
uint8_t r = (c & 0xFF) * a / 255;
932+
uint8_t g = (c >> 8 & 0xFF) * a / 255;
933+
uint8_t b = (c >> 16 & 0xFF) * a / 255;
934+
uint32_t bgra = (a << 24) | (r << 16) | (g << 8) | b;
935+
memcpy(dstRow, &bgra, 4);
881936
}
882937
}
883938
dst += dstStride;
@@ -892,13 +947,13 @@ NAN_METHOD(Context2d::PutImageData) {
892947
uint8_t *dstRow = dst;
893948
uint8_t *srcRow = src;
894949
for (int x = 0; x < cols; ++x) {
895-
// rgba
950+
// rgb[a]
896951
uint8_t r = *srcRow++;
897952
uint8_t g = *srcRow++;
898953
uint8_t b = *srcRow++;
899954
srcRow++;
900955

901-
// argb
956+
// bgra
902957
*dstRow++ = b;
903958
*dstRow++ = g;
904959
*dstRow++ = r;

0 commit comments

Comments
 (0)