|
| 1 | +//! MAIN DEFINES PROVIDED IN PRE_ATTACHED HEADER |
| 2 | + |
// Workgroup shape: SUBCELL_SIZE x SUBCELL_SIZE invocations.
layout(local_size_x = SUBCELL_SIZE, local_size_y = SUBCELL_SIZE) in;

// Shader parameters packed as vec4s. getTexCoord() reads the .xy of element
// TEXSCALE_UBO_OFFSET as the scale applied to the global invocation ID.
layout(binding = 0, std140) uniform AutoExposureInput
{
    vec4 autoExposureInput[];
};

// Output histogram. main() atomically adds each workgroup's per-bin totals
// into one of GLOBAL_REPLICATION copies, each BIN_COUNTu entries long.
layout(binding = 0, std430) restrict coherent buffer OutputData
{
    uint packedHistogram[];
};

// Image whose pixels are converted to luma and binned.
layout(binding = 0) uniform sampler2D tex0;
| 14 | + |
// Returns the texture coordinate this invocation samples: its global XY
// invocation ID multiplied by the scale stored in the .xy of
// autoExposureInput[TEXSCALE_UBO_OFFSET].
vec2 getTexCoord()
{
    /* Disabled alternative, kept for reference:
       interleaved warp loading to reduce image locality (neighbouring threads
       getting similar values and hence atomic contention)

       uint linear = gl_LocalInvocationID.x+32u*(gl_WorkGroupID.x+gl_WorkGroupID.y*gl_NumWorkGroups.x)+gl_LocalInvocationID.y*1024u*32u;
       uvec2 modifiedGlobalAddr = uvec2(linear&511u,linear>>9u);
       vec2 tc = vec2(modifiedGlobalAddr)*autoExposureInput[TEXSCALE_UBO_OFFSET].xy;
    */
    vec2 texCoordScale = autoExposureInput[TEXSCALE_UBO_OFFSET].xy;
    vec2 invocationXY = vec2(gl_GlobalInvocationID.xy);
    return invocationXY*texCoordScale;
}
| 24 | + |
| 25 | + |
// Lower/upper luma limits of the histogram as float32, decoded from the raw
// 16-bit float bit patterns supplied by the pre-attached header.
#define MIN_HISTOGRAM_VAL (unpackHalf2x16(MIN_HISTOGRAM_RAW16F_AS_UINT).x)
#define MAX_HISTOGRAM_VAL (unpackHalf2x16(MAX_HISTOGRAM_RAW16F_AS_UINT).x)

// Number of invocations in one workgroup.
#define LOCAL_THREADS (SUBCELL_SIZE*SUBCELL_SIZE)

// log2 of how many copies of the histogram each workgroup keeps in shared memory.
// This is the only define you can hope to change and still have working code.
#define LOCAL_REPLICATION_POW 3u

// Number of shared-memory histogram copies.
#define LOCAL_REPLICATION (1u<<LOCAL_REPLICATION_POW)
// Length of one shared-memory copy: the real bins plus one extra slot.
// NOTE(review): the extra slot is never written out by main(); presumably it
// absorbs values clamped to MAX_HISTOGRAM_VAL -- confirm.
#define PADDED_BIN_COUNTu (BIN_COUNTu+1u)

// Per-workgroup histogram storage: LOCAL_REPLICATION copies laid out back to back.
shared uint histogram[LOCAL_REPLICATION*PADDED_BIN_COUNTu];
| 37 | + |
// Zeroes every slot of the shared `histogram` array (all LOCAL_REPLICATION
// copies, including each copy's padding slot) and synchronises the workgroup
// so no invocation starts binning before the clear has completed.
//
// The invocations of the workgroup cooperatively stride over the array, so the
// clear is correct for any LOCAL_REPLICATION_POW and does not rely on
// LOCAL_THREADS being equal to BIN_COUNTu. The trip count is a compile-time
// constant, so the compiler is free to unroll it.
//
// Must be called from uniform control flow because it executes barrier().
void clearHistogram()
{
    for (uint slot = gl_LocalInvocationIndex; slot < PADDED_BIN_COUNTu*LOCAL_REPLICATION; slot += uint(LOCAL_THREADS))
        histogram[slot] = 0u;

    // Memory barrier first, so the zeroing stores are ordered before the
    // execution barrier; then barrier() holds every invocation until the whole
    // workgroup has arrived.
    memoryBarrierShared();
    barrier();
}
| 72 | + |
// Counts one sample of `colorVal` in the workgroup's shared histogram.
//
// The colour is reduced to luma with kLumaConvertCoeff and clamped to
// [MIN_HISTOGRAM_VAL, MAX_HISTOGRAM_VAL]. The bin index is derived from the
// float32 bit pattern: it is converted to the float16 bit pattern, divided by
// 2^HISTOGRAM_POT2_RAW16F_BIN_SIZE, and made zero-based by subtracting the
// (equally shifted) raw float16 pattern of the minimum.
//
// Unoptimised form of the same computation, for reference:
//   bits = floatBitsToInt(luma)>>13;
//   bits = ((bits&0x3fc00)-(127-15)*1024)|(bits&0x3ff); // float32 in float16's range -> float16
//   bits = (bits-MIN_HISTOGRAM_RAW16F_AS_UINT)>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
// The code below folds the bin-size shift into the first shift and into the masks.
void putInBin(in vec3 colorVal)
{
    float clampedLuma = clamp(dot(colorVal,kLumaConvertCoeff),MIN_HISTOGRAM_VAL,MAX_HISTOGRAM_VAL);

    // float32 exponent/mantissa masks and the exponent re-bias (127 -> 15),
    // all pre-shifted by the bin size.
    const int binnedExponentMask = 0x3fc00>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
    const int binnedExponentRebias = (127-15)<<(10u-HISTOGRAM_POT2_RAW16F_BIN_SIZE);
    const int binnedMantissaMask = 0x3ff>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;

    // Drop the 13 mantissa bits float16 does not have, plus the bin-size bits.
    int bin = floatBitsToInt(clampedLuma)>>(13u+HISTOGRAM_POT2_RAW16F_BIN_SIZE);
    bin = ((bin&binnedExponentMask)-binnedExponentRebias)|(bin&binnedMantissaMask);
    bin -= MIN_HISTOGRAM_RAW16F_AS_UINT>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;

    // Spread invocations over the shared-memory copies by the low bits of
    // their local index.
    uint replica = gl_LocalInvocationIndex&(LOCAL_REPLICATION-1u);
    atomicAdd(histogram[bin+replica*PADDED_BIN_COUNTu],1u);
}
| 93 | + |
// Entry point. Each invocation:
//   1. fetches one texel of tex0 (issued before the clear and its barrier),
//   2. helps zero the shared histogram,
//   3. bins its texel's luma into one shared-memory copy,
//   4. after a workgroup barrier, sums the bin whose index equals its local
//      invocation index across all shared copies and atomically adds that
//      total to the global histogram copy selected by gl_WorkGroupID.x.
void main()
{
    vec3 texel = textureLod(tex0,getTexCoord(),0.0).rgb;
    clearHistogram();

    putInBin(texel);
    // Make every invocation's shared atomics visible, then wait for the
    // whole workgroup before reading the bins back.
    memoryBarrierShared();
    barrier();

    // Reduce this invocation's bin over all LOCAL_REPLICATION copies.
    uint binTotal = 0u;
    for (uint replica=0u; replica<LOCAL_REPLICATION; ++replica)
        binTotal += histogram[gl_LocalInvocationIndex+replica*PADDED_BIN_COUNTu];

    uint globalCopy = gl_WorkGroupID.x&(GLOBAL_REPLICATION-1u);
    atomicAdd(packedHistogram[gl_LocalInvocationIndex+globalCopy*BIN_COUNTu],binTotal);
}
| 114 | + |
| 115 | +/* |
| 116 | +//#define GLOBAL_REPLICATION 16u |
| 117 | +//! Pretty naive with some warp-contention fighting gimmicks |
| 118 | +void globalVersion() |
| 119 | +{ |
| 120 | + vec3 colorVal = textureLod(tex0,getTexCoord(),0.0).rgb; |
| 121 | + |
| 122 | + uint baseThreadIx = gl_LocalInvocationIndex; |
| 123 | + |
| 124 | + float luma = clamp(dot(colorVal,kLumaConvertCoeff),MIN_HISTOGRAM_VAL,MAX_HISTOGRAM_VAL); |
| 125 | + int lumaBits = floatBitsToInt(luma); |
| 126 | + lumaBits = lumaBits>>(13+HISTOGRAM_POT2_RAW16F_BIN_SIZE); |
| 127 | + const int exponentMask = 0x3fc00>>HISTOGRAM_POT2_RAW16F_BIN_SIZE; |
| 128 | + const int exponentOffset = (127-15)<<(10-HISTOGRAM_POT2_RAW16F_BIN_SIZE); |
| 129 | + const int mantissaMask = 0x3ff>>HISTOGRAM_POT2_RAW16F_BIN_SIZE; |
| 130 | + lumaBits = ((lumaBits&exponentMask)-exponentOffset)|(lumaBits&mantissaMask); |
| 131 | + lumaBits -= MIN_HISTOGRAM_RAW16F_AS_UINT>>HISTOGRAM_POT2_RAW16F_BIN_SIZE; |
| 132 | + |
| 133 | + atomicAdd(packedHistogram[lumaBits+(baseThreadIx&(GLOBAL_REPLICATION-1u))*BIN_COUNTu],1u); |
| 134 | +} |
| 135 | +*/ |
0 commit comments