Skip to content

Commit e631a9b

Browse files
Last touch before a long break from ex. 26
1 parent a9ebee6 commit e631a9b

File tree

3 files changed

+136
-3
lines changed

3 files changed

+136
-3
lines changed
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
//! MAIN DEFINES PROVIDED IN PRE_ATTACHED HEADER
2+
3+
// Work group is SUBCELL_SIZE x SUBCELL_SIZE invocations; SUBCELL_SIZE is not
// defined in this file (presumably supplied by the pre-attached header).
layout(local_size_x = SUBCELL_SIZE, local_size_y = SUBCELL_SIZE) in;
4+
5+
// Uniform parameters as an array of vec4. The only element read in this shader
// is [TEXSCALE_UBO_OFFSET].xy (texture-coordinate scale in getTexCoord()).
// NOTE(review): the array is declared without a size; only the last member of
// a shader storage block may be runtime-sized, so this relies on the index
// being a compile-time constant - confirm against the target compilers.
layout(std140, binding = 0) uniform AutoExposureInput {
6+
vec4 autoExposureInput[];
7+
};
8+
9+
// Output histogram counters; main() accumulates into it with atomicAdd.
layout(std430, binding = 0) restrict coherent buffer OutputData {
10+
uint packedHistogram[];
11+
};
12+
13+
// Source image; sampled at LOD 0 in main().
layout(binding=0) uniform sampler2D tex0;
14+
15+
// Returns the texture coordinate sampled by this invocation: the 2D global
// invocation index multiplied component-wise by the scale stored in
// autoExposureInput[TEXSCALE_UBO_OFFSET].xy.
// The commented-out code is a disabled alternative addressing scheme that
// remaps invocations to texels to reduce atomic contention.
vec2 getTexCoord()
16+
{
17+
/* // interleaved warp loading to reduce image locality (getting similar values and hence atomic contention)
18+
uint linear = gl_LocalInvocationID.x+32u*(gl_WorkGroupID.x+gl_WorkGroupID.y*gl_NumWorkGroups.x)+gl_LocalInvocationID.y*1024u*32u;
19+
uvec2 modifiedGlobalAddr = uvec2(linear&511u,linear>>9u);
20+
vec2 tc = vec2(modifiedGlobalAddr)*autoExposureInput[TEXSCALE_UBO_OFFSET].xy;
21+
*/
22+
return vec2(gl_GlobalInvocationID.xy)*autoExposureInput[TEXSCALE_UBO_OFFSET].xy;
23+
}
24+
25+
26+
// Luma clamp range as floats, decoded from raw half-float bit patterns
// (.x of unpackHalf2x16 is taken from the 16 least-significant bits).
#define MIN_HISTOGRAM_VAL unpackHalf2x16(MIN_HISTOGRAM_RAW16F_AS_UINT).x
27+
#define MAX_HISTOGRAM_VAL unpackHalf2x16(MAX_HISTOGRAM_RAW16F_AS_UINT).x
28+
29+
// Number of invocations in one work group (matches the local_size layout).
#define LOCAL_THREADS (SUBCELL_SIZE*SUBCELL_SIZE)
30+
31+
//this is the only define you can hope to change and still have working code
32+
// log2 of the number of histogram copies kept in shared memory.
// clearHistogram() only has unrolled cases for values up to 4.
#define LOCAL_REPLICATION_POW 3u
33+
34+
// Number of shared-memory histogram copies; putInBin() picks one using the low
// bits of gl_LocalInvocationIndex.
#define LOCAL_REPLICATION (1u<<LOCAL_REPLICATION_POW)
35+
// Stride between consecutive copies: one slot more than the bin count.
// NOTE(review): the extra slot presumably absorbs the top clamped value and/or
// staggers the copies across memory banks - confirm intent.
#define PADDED_BIN_COUNTu (BIN_COUNTu+1u)
36+
// Work-group-local histogram storage holding LOCAL_REPLICATION padded copies.
shared uint histogram[PADDED_BIN_COUNTu*LOCAL_REPLICATION];
37+
38+
// Cooperatively zeroes the shared `histogram` array and then synchronises the
// work group.
// Each invocation clears one element per LOCAL_THREADS-sized stripe; the
// preprocessor ladder unrolls 2^LOCAL_REPLICATION_POW stripes (1, 2, 4, 8 or
// 16). The remaining LOCAL_REPLICATION elements past the last stripe are
// cleared by the first LOCAL_REPLICATION invocations.
// NOTE(review): covering the whole array this way assumes
// LOCAL_THREADS == BIN_COUNTu - confirm against the pre-attached header.
// NOTE(review): the tail clear sits inside the `>=1u` conditional, so it is
// compiled out when LOCAL_REPLICATION_POW is 0.
void clearHistogram()
39+
{
40+
//clear histogram
41+
histogram[gl_LocalInvocationIndex+0u*LOCAL_THREADS] = 0u;
42+
#if LOCAL_REPLICATION_POW>=1u
43+
histogram[gl_LocalInvocationIndex+1u*LOCAL_THREADS] = 0u;
44+
#if LOCAL_REPLICATION_POW>=2u
45+
histogram[gl_LocalInvocationIndex+2u*LOCAL_THREADS] = 0u;
46+
histogram[gl_LocalInvocationIndex+3u*LOCAL_THREADS] = 0u;
47+
#if LOCAL_REPLICATION_POW>=3u
48+
histogram[gl_LocalInvocationIndex+4u*LOCAL_THREADS] = 0u;
49+
histogram[gl_LocalInvocationIndex+5u*LOCAL_THREADS] = 0u;
50+
histogram[gl_LocalInvocationIndex+6u*LOCAL_THREADS] = 0u;
51+
histogram[gl_LocalInvocationIndex+7u*LOCAL_THREADS] = 0u;
52+
#if LOCAL_REPLICATION_POW>=4u
53+
histogram[gl_LocalInvocationIndex+8u*LOCAL_THREADS] = 0u;
54+
histogram[gl_LocalInvocationIndex+9u*LOCAL_THREADS] = 0u;
55+
histogram[gl_LocalInvocationIndex+10u*LOCAL_THREADS] = 0u;
56+
histogram[gl_LocalInvocationIndex+11u*LOCAL_THREADS] = 0u;
57+
histogram[gl_LocalInvocationIndex+12u*LOCAL_THREADS] = 0u;
58+
histogram[gl_LocalInvocationIndex+13u*LOCAL_THREADS] = 0u;
59+
histogram[gl_LocalInvocationIndex+14u*LOCAL_THREADS] = 0u;
60+
histogram[gl_LocalInvocationIndex+15u*LOCAL_THREADS] = 0u;
61+
#endif // LOCAL_REPLICATION_POW>=4u
62+
#endif // LOCAL_REPLICATION_POW>=3u
63+
#endif // LOCAL_REPLICATION_POW>=2u
64+
// tail: the elements beyond the last full LOCAL_THREADS-sized stripe
if (gl_LocalInvocationIndex<LOCAL_REPLICATION)
65+
histogram[gl_LocalInvocationIndex+LOCAL_REPLICATION*LOCAL_THREADS] = 0u;
66+
#endif // LOCAL_REPLICATION_POW>=1u
67+
68+
// what order to put the barriers?
69+
// memory barrier first, then the execution barrier, so the zeroing is ordered
// before any invocation proceeds to bin its sample
memoryBarrierShared();
70+
barrier();
71+
}
72+
73+
// Adds one sample to the shared-memory histogram.
//   colorVal - colour whose luma (dot product with kLumaConvertCoeff, defined
//              outside this file) is binned.
// The luma is clamped to [MIN_HISTOGRAM_VAL, MAX_HISTOGRAM_VAL], its float32
// bits are converted to a half-float-style bit pattern quantised by
// HISTOGRAM_POT2_RAW16F_BIN_SIZE bits, and the quantised minimum is subtracted
// to obtain the bin index. The count is incremented atomically in the histogram
// copy selected by the low bits of gl_LocalInvocationIndex.
void putInBin(in vec3 colorVal)
74+
{
75+
float luma = clamp(dot(colorVal,kLumaConvertCoeff),MIN_HISTOGRAM_VAL,MAX_HISTOGRAM_VAL);
76+
int lumaBits = floatBitsToInt(luma);
77+
//lumaBits = (clamp(((lumaBits>>23)&0xff)-127+15,0,31)<<10)|((lumaBits>>13)&0x3ff); // convert a positive float32 to float16
78+
//lumaBits = ((((lumaBits>>23)&0xff)-127+15)<<10)|((lumaBits>>13)&0x3ff); // convert a float32 in float16's range to float16
79+
//optimized conversion
80+
//lumaBits = lumaBits>>13;
81+
//lumaBits = ((lumaBits&0x3fc00)-(127-15)*1024)|(lumaBits&0x3ff);
82+
//lumaBits -= MIN_HISTOGRAM_RAW16F_AS_UINT;
83+
// very optimized
84+
// Same conversion as the "optimized" variant above, with every shift and mask
// additionally shifted right by the bin-size power so that
// 2^HISTOGRAM_POT2_RAW16F_BIN_SIZE adjacent half-float values share one bin.
// 13 = 23 - 10: drops the float32 mantissa bits a half-float does not have.
lumaBits = lumaBits>>(13u+HISTOGRAM_POT2_RAW16F_BIN_SIZE);
85+
// 0x3fc00: the 8-bit float32 exponent field after the shift by 13
const int exponentMask = 0x3fc00>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
86+
// (127-15): difference between the float32 and float16 exponent biases
const int exponentOffset = (127-15)<<(10u-HISTOGRAM_POT2_RAW16F_BIN_SIZE);
87+
// 0x3ff: the 10 mantissa bits kept by a half-float
const int mantissaMask = 0x3ff>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
88+
lumaBits = ((lumaBits&exponentMask)-exponentOffset)|(lumaBits&mantissaMask);
89+
// rebase so that the clamped minimum maps to bin 0
lumaBits -= MIN_HISTOGRAM_RAW16F_AS_UINT>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
90+
91+
// the low bits of the invocation index pick the copy, PADDED_BIN_COUNTu is the stride between copies
atomicAdd(histogram[lumaBits+(gl_LocalInvocationIndex&(LOCAL_REPLICATION-1u))*PADDED_BIN_COUNTu],1u);
92+
}
93+
94+
// Entry point: each invocation samples one texel, bins its luma into the
// shared histogram, then adds its share of the work group's result to the
// output buffer.
// NOTE(review): the write-out has every invocation handle the bin equal to its
// gl_LocalInvocationIndex, which presumably requires LOCAL_THREADS == BIN_COUNTu;
// the `&(GLOBAL_REPLICATION-1u)` mask assumes GLOBAL_REPLICATION is a power of
// two - confirm both against the pre-attached header.
void main()
95+
{
96+
// the texture fetch is issued before the shared-memory clear and its barrier
vec3 colorVal = textureLod(tex0,getTexCoord(),0.0).rgb;
97+
clearHistogram();
98+
99+
100+
putInBin(colorVal);
101+
// what order to put the barriers?
102+
// all atomic increments must be complete and visible before the sums below are read
memoryBarrierShared();
103+
barrier();
104+
105+
106+
// sum this invocation's bin over all shared-memory copies (stride PADDED_BIN_COUNTu)
uint writeOutVal = histogram[gl_LocalInvocationIndex];
107+
#if LOCAL_REPLICATION_POW>0u
108+
for (uint i=1u; i<LOCAL_REPLICATION; i++)
109+
writeOutVal += histogram[gl_LocalInvocationIndex+i*PADDED_BIN_COUNTu];
110+
#endif
111+
112+
// the output buffer holds several histogram copies with stride BIN_COUNTu
// (no padding); the low bits of gl_WorkGroupID.x choose which copy this work
// group accumulates into
atomicAdd(packedHistogram[gl_LocalInvocationIndex+(gl_WorkGroupID.x&(GLOBAL_REPLICATION-1u))*BIN_COUNTu],writeOutVal);
113+
}
114+
115+
/*
116+
//#define GLOBAL_REPLICATION 16u
117+
//! Pretty naive with some warp-contention fighting gimmicks
118+
void globalVersion()
119+
{
120+
vec3 colorVal = textureLod(tex0,getTexCoord(),0.0).rgb;
121+
122+
uint baseThreadIx = gl_LocalInvocationIndex;
123+
124+
float luma = clamp(dot(colorVal,kLumaConvertCoeff),MIN_HISTOGRAM_VAL,MAX_HISTOGRAM_VAL);
125+
int lumaBits = floatBitsToInt(luma);
126+
lumaBits = lumaBits>>(13+HISTOGRAM_POT2_RAW16F_BIN_SIZE);
127+
const int exponentMask = 0x3fc00>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
128+
const int exponentOffset = (127-15)<<(10-HISTOGRAM_POT2_RAW16F_BIN_SIZE);
129+
const int mantissaMask = 0x3ff>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
130+
lumaBits = ((lumaBits&exponentMask)-exponentOffset)|(lumaBits&mantissaMask);
131+
lumaBits -= MIN_HISTOGRAM_RAW16F_AS_UINT>>HISTOGRAM_POT2_RAW16F_BIN_SIZE;
132+
133+
atomicAdd(packedHistogram[lumaBits+(baseThreadIx&(GLOBAL_REPLICATION-1u))*BIN_COUNTu],1u);
134+
}
135+
*/

examples_tests/26.MultidrawIndirectVSCPUCull/main.cpp

+1-3
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,6 @@ int main()
139139

140140
#define kInstanceCount 4096
141141
#define kTotalTriangleLimit (64*1024*1024)
142-
#define kMinTriangleLimit 64
143142

144143
scene::ICameraSceneNode* camera =
145144
smgr->addCameraSceneNodeFPS(0,100.0f,0.01f);
@@ -183,8 +182,7 @@ int main()
183182

184183
std::random_device rd;
185184
std::mt19937 mt(rd());
186-
//std::uniform_int_distribution<uint32_t> dist(kMinTriangleLimit, kTotalTriangleLimit*2/kInstanceCount-kMinTriangleLimit);
187-
std::uniform_int_distribution<uint32_t> dist(kMinTriangleLimit, kMinTriangleLimit*18);
185+
std::uniform_int_distribution<uint32_t> dist(16, 4*1024);
188186
for (size_t i=0; i<kInstanceCount; i++)
189187
{
190188
float poly = sqrtf(dist(mt))+0.5f;

0 commit comments

Comments
 (0)