@@ -396,7 +396,7 @@ namespace StockhamGenerator
396
396
return ;
397
397
}
398
398
399
- size_t baseRadix[] = {7 ,5 ,3 ,2 }; // list only supported primes
399
+ size_t baseRadix[] = {13 , 11 , 7 ,5 ,3 ,2 }; // list only supported primes
400
400
size_t baseRadixSize = sizeof (baseRadix)/sizeof (baseRadix[0 ]);
401
401
402
402
size_t l = length;
@@ -437,7 +437,19 @@ namespace StockhamGenerator
437
437
{
438
438
workGroupSize = 49 ;
439
439
numTrans = length >= 7 *workGroupSize ? 1 : (7 *workGroupSize)/length;
440
- } else {
440
+ }
441
+ else if (primeFactorsExpanded[11 ] == length) // Length is pure power of 11
442
+ {
443
+ workGroupSize = 121 ;
444
+ numTrans = length >= 11 * workGroupSize ? 1 : (11 * workGroupSize) / length;
445
+ }
446
+ else if (primeFactorsExpanded[13 ] == length) // Length is pure power of 13
447
+ {
448
+ workGroupSize = 169 ;
449
+ numTrans = length >= 13 * workGroupSize ? 1 : (13 * workGroupSize) / length;
450
+ }
451
+ else
452
+ {
441
453
size_t leastNumPerWI = 1 ; // least number of elements in one work item
442
454
size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired
443
455
@@ -470,7 +482,14 @@ namespace StockhamGenerator
470
482
leastNumPerWI = 70 ; maxWorkGroupSize = 36 ;
471
483
} else if (primeFactorsExpanded[3 ] * primeFactorsExpanded[5 ] * primeFactorsExpanded[7 ] == length) {
472
484
leastNumPerWI =105 ; maxWorkGroupSize = 24 ;
473
- } else {
485
+ }
486
+ else if (primeFactorsExpanded[2 ] * primeFactorsExpanded[11 ] == length) {
487
+ leastNumPerWI = 22 ; maxWorkGroupSize = 128 ;
488
+ }
489
+ else if (primeFactorsExpanded[2 ] * primeFactorsExpanded[13 ] == length) {
490
+ leastNumPerWI = 26 ; maxWorkGroupSize = 128 ;
491
+ }
492
+ else {
474
493
leastNumPerWI =210 ; maxWorkGroupSize = 12 ;
475
494
}
476
495
if (pr==P_DOUBLE)
@@ -2025,7 +2044,7 @@ namespace StockhamGenerator
2025
2044
fft_postCallback = postcallbackParam;
2026
2045
}
2027
2046
2028
- void GeneratePass ( bool fwd, std::string &passStr, bool fft_3StepTwiddle,
2047
+ void GeneratePass ( bool fwd, std::string &passStr, bool fft_3StepTwiddle, bool twiddleFront,
2029
2048
bool inInterleaved, bool outInterleaved,
2030
2049
bool inReal, bool outReal,
2031
2050
size_t inStride, size_t outStride, double scale,
@@ -2495,7 +2514,7 @@ namespace StockhamGenerator
2495
2514
2496
2515
// 3-step twiddle multiplies done in the front
2497
2516
bool tw3Done = false ;
2498
- if (fft_3StepTwiddle && (position == 0 ) )
2517
+ if (fft_3StepTwiddle && twiddleFront )
2499
2518
{
2500
2519
tw3Done = true ;
2501
2520
if (linearRegs)
@@ -3019,7 +3038,7 @@ namespace StockhamGenerator
3019
3038
else
3020
3039
{
3021
3040
// Possible radices
3022
- size_t cRad[] = {10 ,8 ,7 ,6 ,5 ,4 ,3 ,2 ,1 }; // Must be in descending order
3041
+ size_t cRad[] = {13 , 11 , 10 ,8 ,7 ,6 ,5 ,4 ,3 ,2 ,1 }; // Must be in descending order
3023
3042
size_t cRadSize = (sizeof (cRad)/sizeof (cRad[0 ]));
3024
3043
3025
3044
// Generate the radix and pass objects
@@ -3233,32 +3252,12 @@ namespace StockhamGenerator
3233
3252
3234
3253
std::string sfx = FloatSuffix<PR>();
3235
3254
3255
+ // Base type
3256
+ str += " #define fptype " ; str += RegBaseType<PR>(1 ); str += " \n\n " ;
3257
+
3236
3258
// Vector type
3237
3259
str += " #define fvect2 " ; str += RegBaseType<PR>(2 ); str += " \n\n " ;
3238
3260
3239
- // constants
3240
- str += " #define C8Q 0.70710678118654752440084436210485" ; str += sfx; str += " \n " ;
3241
-
3242
- str += " #define C5QA 0.30901699437494742410229341718282" ; str += sfx; str += " \n " ;
3243
- str += " #define C5QB 0.95105651629515357211643933337938" ; str += sfx; str += " \n " ;
3244
- str += " #define C5QC 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3245
- str += " #define C5QD 0.58778525229247312916870595463907" ; str += sfx; str += " \n " ;
3246
- str += " #define C5QE 0.80901699437494742410229341718282" ; str += sfx; str += " \n " ;
3247
-
3248
- str += " #define C3QA 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3249
- str += " #define C3QB 0.86602540378443864676372317075294" ; str += sfx; str += " \n " ;
3250
-
3251
- str += " #define C7Q1 -1.16666666666666651863693004997913" ; str += sfx; str += " \n " ;
3252
- str += " #define C7Q2 0.79015646852540022404554065360571" ; str += sfx; str += " \n " ;
3253
- str += " #define C7Q3 0.05585426728964774240049351305970" ; str += sfx; str += " \n " ;
3254
- str += " #define C7Q4 0.73430220123575240531721419756650" ; str += sfx; str += " \n " ;
3255
- str += " #define C7Q5 0.44095855184409837868031445395900" ; str += sfx; str += " \n " ;
3256
- str += " #define C7Q6 0.34087293062393136944265847887436" ; str += sfx; str += " \n " ;
3257
- str += " #define C7Q7 -0.53396936033772524066165487965918" ; str += sfx; str += " \n " ;
3258
- str += " #define C7Q8 0.87484229096165666561546458979137" ; str += sfx; str += " \n " ;
3259
-
3260
- str += " \n " ;
3261
-
3262
3261
bool cReg = linearRegs ? true : false ;
3263
3262
3264
3263
// Generate butterflies for all unique radices
@@ -3269,6 +3268,86 @@ namespace StockhamGenerator
3269
3268
uradices.sort ();
3270
3269
uradices.unique ();
3271
3270
3271
+
3272
+ // constants
3273
+ if (length%8 == 0 )
3274
+ {
3275
+ str += " #define C8Q 0.70710678118654752440084436210485" ; str += sfx; str += " \n " ;
3276
+ }
3277
+
3278
+ if (length % 5 == 0 )
3279
+ {
3280
+ str += " #define C5QA 0.30901699437494742410229341718282" ; str += sfx; str += " \n " ;
3281
+ str += " #define C5QB 0.95105651629515357211643933337938" ; str += sfx; str += " \n " ;
3282
+ str += " #define C5QC 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3283
+ str += " #define C5QD 0.58778525229247312916870595463907" ; str += sfx; str += " \n " ;
3284
+ str += " #define C5QE 0.80901699437494742410229341718282" ; str += sfx; str += " \n " ;
3285
+ }
3286
+
3287
+ if (length % 3 == 0 )
3288
+ {
3289
+ str += " #define C3QA 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3290
+ str += " #define C3QB 0.86602540378443864676372317075294" ; str += sfx; str += " \n " ;
3291
+ }
3292
+
3293
+ if (length % 7 == 0 )
3294
+ {
3295
+ str += " #define C7Q1 -1.16666666666666651863693004997913" ; str += sfx; str += " \n " ;
3296
+ str += " #define C7Q2 0.79015646852540022404554065360571" ; str += sfx; str += " \n " ;
3297
+ str += " #define C7Q3 0.05585426728964774240049351305970" ; str += sfx; str += " \n " ;
3298
+ str += " #define C7Q4 0.73430220123575240531721419756650" ; str += sfx; str += " \n " ;
3299
+ str += " #define C7Q5 0.44095855184409837868031445395900" ; str += sfx; str += " \n " ;
3300
+ str += " #define C7Q6 0.34087293062393136944265847887436" ; str += sfx; str += " \n " ;
3301
+ str += " #define C7Q7 -0.53396936033772524066165487965918" ; str += sfx; str += " \n " ;
3302
+ str += " #define C7Q8 0.87484229096165666561546458979137" ; str += sfx; str += " \n " ;
3303
+ }
3304
+
3305
+ if (length % 11 == 0 )
3306
+ {
3307
+ str += " #define b11_0 0.9898214418809327" ; str += sfx; str += " \n " ;
3308
+ str += " #define b11_1 0.9594929736144973" ; str += sfx; str += " \n " ;
3309
+ str += " #define b11_2 0.9189859472289947" ; str += sfx; str += " \n " ;
3310
+ str += " #define b11_3 0.8767688310025893" ; str += sfx; str += " \n " ;
3311
+ str += " #define b11_4 0.8308300260037728" ; str += sfx; str += " \n " ;
3312
+ str += " #define b11_5 0.7784344533346518" ; str += sfx; str += " \n " ;
3313
+ str += " #define b11_6 0.7153703234534297" ; str += sfx; str += " \n " ;
3314
+ str += " #define b11_7 0.6343562706824244" ; str += sfx; str += " \n " ;
3315
+ str += " #define b11_8 0.3425847256816375" ; str += sfx; str += " \n " ;
3316
+ str += " #define b11_9 0.5211085581132027" ; str += sfx; str += " \n " ;
3317
+ }
3318
+
3319
+ if (length % 13 == 0 )
3320
+ {
3321
+ str += " #define b13_0 0.9682872443619840" ; str += sfx; str += " \n " ;
3322
+ str += " #define b13_1 0.9578059925946651" ; str += sfx; str += " \n " ;
3323
+ str += " #define b13_2 0.8755023024091479" ; str += sfx; str += " \n " ;
3324
+ str += " #define b13_3 0.8660254037844386" ; str += sfx; str += " \n " ;
3325
+ str += " #define b13_4 0.8595425350987748" ; str += sfx; str += " \n " ;
3326
+ str += " #define b13_5 0.8534800018598239" ; str += sfx; str += " \n " ;
3327
+ str += " #define b13_6 0.7693388175729806" ; str += sfx; str += " \n " ;
3328
+ str += " #define b13_7 0.6865583707817543" ; str += sfx; str += " \n " ;
3329
+ str += " #define b13_8 0.6122646503767565" ; str += sfx; str += " \n " ;
3330
+ str += " #define b13_9 0.6004772719326652" ; str += sfx; str += " \n " ;
3331
+ str += " #define b13_10 0.5817047785105157" ; str += sfx; str += " \n " ;
3332
+ str += " #define b13_11 0.5751407294740031" ; str += sfx; str += " \n " ;
3333
+ str += " #define b13_12 0.5220263851612750" ; str += sfx; str += " \n " ;
3334
+ str += " #define b13_13 0.5200285718888646" ; str += sfx; str += " \n " ;
3335
+ str += " #define b13_14 0.5165207806234897" ; str += sfx; str += " \n " ;
3336
+ str += " #define b13_15 0.5149187780863157" ; str += sfx; str += " \n " ;
3337
+ str += " #define b13_16 0.5035370328637666" ; str += sfx; str += " \n " ;
3338
+ str += " #define b13_17 0.5000000000000000" ; str += sfx; str += " \n " ;
3339
+ str += " #define b13_18 0.3027756377319946" ; str += sfx; str += " \n " ;
3340
+ str += " #define b13_19 0.3014792600477098" ; str += sfx; str += " \n " ;
3341
+ str += " #define b13_20 0.3004626062886657" ; str += sfx; str += " \n " ;
3342
+ str += " #define b13_21 0.2517685164318833" ; str += sfx; str += " \n " ;
3343
+ str += " #define b13_22 0.2261094450357824" ; str += sfx; str += " \n " ;
3344
+ str += " #define b13_23 0.0833333333333333" ; str += sfx; str += " \n " ;
3345
+ str += " #define b13_24 0.0386329546443481" ; str += sfx; str += " \n " ;
3346
+ }
3347
+
3348
+ str += " \n " ;
3349
+
3350
+
3272
3351
// If pre-callback is set for the plan
3273
3352
std::string callbackstr;
3274
3353
if (params.fft_hasPreCallback )
@@ -3351,7 +3430,7 @@ namespace StockhamGenerator
3351
3430
if ((p+1 ) != passes.end ()) { outIlvd = ldsInterleaved; }
3352
3431
}
3353
3432
3354
- p->GeneratePass (fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn , gOut );
3433
+ p->GeneratePass (fwd, str, tw3Step, params. fft_twiddleFront , inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn , gOut );
3355
3434
}
3356
3435
3357
3436
// if real transform we do only 1 direction
0 commit comments