Skip to content

Commit b7da307

Browse files
committed
Merge pull request clMathLibraries#142 from bragadeesh/develop
Add radix 11/13 to support powers of 11/13 transform sizes
2 parents 4e67415 + 237f0d0 commit b7da307

14 files changed, with 1142 additions and 278 deletions

src/library/action.transpose.cpp

+1 -4
Original file line number | Diff line number | Diff line change
@@ -223,7 +223,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
223223
}
224224
}
225225
OPENCL_V(clfft_transpose_generator::genTransposeKernelLeadingDimensionBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
226-
//std::cout << programCode << std::endl;//TIMMY
227226
}
228227
else if (this->signature.nonSquareKernelType == NON_SQUARE_TRANS_TRANSPOSE_BATCHED)
229228
{
@@ -247,7 +246,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
247246
}
248247
}
249248
OPENCL_V(clfft_transpose_generator::genTransposeKernelBatched(this->signature, programCode, lwSize, reShapeFactor), _T("genTransposeKernel() failed!"));
250-
//std::cout << programCode << std::endl;//TIMMY
251249
}
252250
else
253251
{
@@ -283,7 +281,6 @@ clfftStatus FFTGeneratedTransposeNonSquareAction::generateKernel(FFTRepo& fftRep
283281
*/
284282
//general swap kernel takes care of all ratio
285283
OPENCL_V(clfft_transpose_generator::genSwapKernelGeneral(this->signature, programCode, kernelFuncName, lwSize, reShapeFactor), _T("genSwapKernel() failed!"));
286-
//std::cout << programCode << std::endl;//TIMMY
287284
}
288285

289286
cl_int status = CL_SUCCESS;
@@ -720,7 +717,7 @@ clfftStatus FFTGeneratedTransposeSquareAction::generateKernel(FFTRepo& fftRepo,
720717
{
721718
OPENCL_V(fftRepo.setProgramEntryPoints(Transpose_SQUARE, this->getSignatureData(), "transpose_square", "transpose_square", Device, QueueContext), _T("fftRepo.setProgramEntryPoint() failed!"));
722719
}
723-
//std::cout << programCode << std::endl;//TIMMY
720+
724721
return CLFFT_SUCCESS;
725722
}
726723

src/library/generator.stockham.cpp

+109 -30
Original file line number | Diff line number | Diff line change
@@ -396,7 +396,7 @@ namespace StockhamGenerator
396396
return;
397397
}
398398

399-
size_t baseRadix[] = {7,5,3,2}; // list only supported primes
399+
size_t baseRadix[] = {13,11,7,5,3,2}; // list only supported primes
400400
size_t baseRadixSize = sizeof(baseRadix)/sizeof(baseRadix[0]);
401401

402402
size_t l = length;
@@ -437,7 +437,19 @@ namespace StockhamGenerator
437437
{
438438
workGroupSize = 49;
439439
numTrans = length >= 7*workGroupSize ? 1 : (7*workGroupSize)/length;
440-
} else {
440+
}
441+
else if (primeFactorsExpanded[11] == length) // Length is pure power of 11
442+
{
443+
workGroupSize = 121;
444+
numTrans = length >= 11 * workGroupSize ? 1 : (11 * workGroupSize) / length;
445+
}
446+
else if (primeFactorsExpanded[13] == length) // Length is pure power of 13
447+
{
448+
workGroupSize = 169;
449+
numTrans = length >= 13 * workGroupSize ? 1 : (13 * workGroupSize) / length;
450+
}
451+
else
452+
{
441453
size_t leastNumPerWI = 1; // least number of elements in one work item
442454
size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired
443455

@@ -470,7 +482,14 @@ namespace StockhamGenerator
470482
leastNumPerWI = 70; maxWorkGroupSize = 36;
471483
} else if (primeFactorsExpanded[3] * primeFactorsExpanded[5] * primeFactorsExpanded[7] == length) {
472484
leastNumPerWI =105; maxWorkGroupSize = 24;
473-
} else {
485+
}
486+
else if (primeFactorsExpanded[2] * primeFactorsExpanded[11] == length) {
487+
leastNumPerWI = 22; maxWorkGroupSize = 128;
488+
}
489+
else if (primeFactorsExpanded[2] * primeFactorsExpanded[13] == length) {
490+
leastNumPerWI = 26; maxWorkGroupSize = 128;
491+
}
492+
else {
474493
leastNumPerWI =210; maxWorkGroupSize = 12;
475494
}
476495
if (pr==P_DOUBLE)
@@ -2025,7 +2044,7 @@ namespace StockhamGenerator
20252044
fft_postCallback = postcallbackParam;
20262045
}
20272046

2028-
void GeneratePass( bool fwd, std::string &passStr, bool fft_3StepTwiddle,
2047+
void GeneratePass( bool fwd, std::string &passStr, bool fft_3StepTwiddle, bool twiddleFront,
20292048
bool inInterleaved, bool outInterleaved,
20302049
bool inReal, bool outReal,
20312050
size_t inStride, size_t outStride, double scale,
@@ -2495,7 +2514,7 @@ namespace StockhamGenerator
24952514

24962515
// 3-step twiddle multiplies done in the front
24972516
bool tw3Done = false;
2498-
if(fft_3StepTwiddle && (position == 0))
2517+
if(fft_3StepTwiddle && twiddleFront)
24992518
{
25002519
tw3Done = true;
25012520
if(linearRegs)
@@ -3019,7 +3038,7 @@ namespace StockhamGenerator
30193038
else
30203039
{
30213040
// Possible radices
3022-
size_t cRad[] = {10,8,7,6,5,4,3,2,1}; // Must be in descending order
3041+
size_t cRad[] = {13,11,10,8,7,6,5,4,3,2,1}; // Must be in descending order
30233042
size_t cRadSize = (sizeof(cRad)/sizeof(cRad[0]));
30243043

30253044
// Generate the radix and pass objects
@@ -3233,32 +3252,12 @@ namespace StockhamGenerator
32333252

32343253
std::string sfx = FloatSuffix<PR>();
32353254

3255+
// Base type
3256+
str += "#define fptype "; str += RegBaseType<PR>(1); str += "\n\n";
3257+
32363258
// Vector type
32373259
str += "#define fvect2 "; str += RegBaseType<PR>(2); str += "\n\n";
32383260

3239-
//constants
3240-
str += "#define C8Q 0.70710678118654752440084436210485"; str += sfx; str += "\n";
3241-
3242-
str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
3243-
str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
3244-
str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
3245-
str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
3246-
str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";
3247-
3248-
str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
3249-
str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
3250-
3251-
str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
3252-
str += "#define C7Q2 0.79015646852540022404554065360571"; str += sfx; str += "\n";
3253-
str += "#define C7Q3 0.05585426728964774240049351305970"; str += sfx; str += "\n";
3254-
str += "#define C7Q4 0.73430220123575240531721419756650"; str += sfx; str += "\n";
3255-
str += "#define C7Q5 0.44095855184409837868031445395900"; str += sfx; str += "\n";
3256-
str += "#define C7Q6 0.34087293062393136944265847887436"; str += sfx; str += "\n";
3257-
str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
3258-
str += "#define C7Q8 0.87484229096165666561546458979137"; str += sfx; str += "\n";
3259-
3260-
str += "\n";
3261-
32623261
bool cReg = linearRegs ? true : false;
32633262

32643263
// Generate butterflies for all unique radices
@@ -3269,6 +3268,86 @@ namespace StockhamGenerator
32693268
uradices.sort();
32703269
uradices.unique();
32713270

3271+
3272+
//constants
3273+
if (length%8 == 0)
3274+
{
3275+
str += "#define C8Q 0.70710678118654752440084436210485"; str += sfx; str += "\n";
3276+
}
3277+
3278+
if (length % 5 == 0)
3279+
{
3280+
str += "#define C5QA 0.30901699437494742410229341718282"; str += sfx; str += "\n";
3281+
str += "#define C5QB 0.95105651629515357211643933337938"; str += sfx; str += "\n";
3282+
str += "#define C5QC 0.50000000000000000000000000000000"; str += sfx; str += "\n";
3283+
str += "#define C5QD 0.58778525229247312916870595463907"; str += sfx; str += "\n";
3284+
str += "#define C5QE 0.80901699437494742410229341718282"; str += sfx; str += "\n";
3285+
}
3286+
3287+
if (length % 3 == 0)
3288+
{
3289+
str += "#define C3QA 0.50000000000000000000000000000000"; str += sfx; str += "\n";
3290+
str += "#define C3QB 0.86602540378443864676372317075294"; str += sfx; str += "\n";
3291+
}
3292+
3293+
if (length % 7 == 0)
3294+
{
3295+
str += "#define C7Q1 -1.16666666666666651863693004997913"; str += sfx; str += "\n";
3296+
str += "#define C7Q2 0.79015646852540022404554065360571"; str += sfx; str += "\n";
3297+
str += "#define C7Q3 0.05585426728964774240049351305970"; str += sfx; str += "\n";
3298+
str += "#define C7Q4 0.73430220123575240531721419756650"; str += sfx; str += "\n";
3299+
str += "#define C7Q5 0.44095855184409837868031445395900"; str += sfx; str += "\n";
3300+
str += "#define C7Q6 0.34087293062393136944265847887436"; str += sfx; str += "\n";
3301+
str += "#define C7Q7 -0.53396936033772524066165487965918"; str += sfx; str += "\n";
3302+
str += "#define C7Q8 0.87484229096165666561546458979137"; str += sfx; str += "\n";
3303+
}
3304+
3305+
if (length % 11 == 0)
3306+
{
3307+
str += "#define b11_0 0.9898214418809327"; str += sfx; str += "\n";
3308+
str += "#define b11_1 0.9594929736144973"; str += sfx; str += "\n";
3309+
str += "#define b11_2 0.9189859472289947"; str += sfx; str += "\n";
3310+
str += "#define b11_3 0.8767688310025893"; str += sfx; str += "\n";
3311+
str += "#define b11_4 0.8308300260037728"; str += sfx; str += "\n";
3312+
str += "#define b11_5 0.7784344533346518"; str += sfx; str += "\n";
3313+
str += "#define b11_6 0.7153703234534297"; str += sfx; str += "\n";
3314+
str += "#define b11_7 0.6343562706824244"; str += sfx; str += "\n";
3315+
str += "#define b11_8 0.3425847256816375"; str += sfx; str += "\n";
3316+
str += "#define b11_9 0.5211085581132027"; str += sfx; str += "\n";
3317+
}
3318+
3319+
if (length % 13 == 0)
3320+
{
3321+
str += "#define b13_0 0.9682872443619840"; str += sfx; str += "\n";
3322+
str += "#define b13_1 0.9578059925946651"; str += sfx; str += "\n";
3323+
str += "#define b13_2 0.8755023024091479"; str += sfx; str += "\n";
3324+
str += "#define b13_3 0.8660254037844386"; str += sfx; str += "\n";
3325+
str += "#define b13_4 0.8595425350987748"; str += sfx; str += "\n";
3326+
str += "#define b13_5 0.8534800018598239"; str += sfx; str += "\n";
3327+
str += "#define b13_6 0.7693388175729806"; str += sfx; str += "\n";
3328+
str += "#define b13_7 0.6865583707817543"; str += sfx; str += "\n";
3329+
str += "#define b13_8 0.6122646503767565"; str += sfx; str += "\n";
3330+
str += "#define b13_9 0.6004772719326652"; str += sfx; str += "\n";
3331+
str += "#define b13_10 0.5817047785105157"; str += sfx; str += "\n";
3332+
str += "#define b13_11 0.5751407294740031"; str += sfx; str += "\n";
3333+
str += "#define b13_12 0.5220263851612750"; str += sfx; str += "\n";
3334+
str += "#define b13_13 0.5200285718888646"; str += sfx; str += "\n";
3335+
str += "#define b13_14 0.5165207806234897"; str += sfx; str += "\n";
3336+
str += "#define b13_15 0.5149187780863157"; str += sfx; str += "\n";
3337+
str += "#define b13_16 0.5035370328637666"; str += sfx; str += "\n";
3338+
str += "#define b13_17 0.5000000000000000"; str += sfx; str += "\n";
3339+
str += "#define b13_18 0.3027756377319946"; str += sfx; str += "\n";
3340+
str += "#define b13_19 0.3014792600477098"; str += sfx; str += "\n";
3341+
str += "#define b13_20 0.3004626062886657"; str += sfx; str += "\n";
3342+
str += "#define b13_21 0.2517685164318833"; str += sfx; str += "\n";
3343+
str += "#define b13_22 0.2261094450357824"; str += sfx; str += "\n";
3344+
str += "#define b13_23 0.0833333333333333"; str += sfx; str += "\n";
3345+
str += "#define b13_24 0.0386329546443481"; str += sfx; str += "\n";
3346+
}
3347+
3348+
str += "\n";
3349+
3350+
32723351
//If pre-callback is set for the plan
32733352
std::string callbackstr;
32743353
if (params.fft_hasPreCallback)
@@ -3351,7 +3430,7 @@ namespace StockhamGenerator
33513430
if((p+1) != passes.end()) { outIlvd = ldsInterleaved; }
33523431
}
33533432

3354-
p->GeneratePass(fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
3433+
p->GeneratePass(fwd, str, tw3Step, params.fft_twiddleFront, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn, gOut);
33553434
}
33563435

33573436
// if real transform we do only 1 direction

0 commit comments

Comments
 (0)