@@ -396,7 +396,7 @@ namespace StockhamGenerator
396396 return ;
397397 }
398398
399- size_t baseRadix[] = {7 ,5 ,3 ,2 }; // list only supported primes
399+ size_t baseRadix[] = {13 , 11 , 7 ,5 ,3 ,2 }; // list only supported primes
400400 size_t baseRadixSize = sizeof (baseRadix)/sizeof (baseRadix[0 ]);
401401
402402 size_t l = length;
@@ -437,7 +437,19 @@ namespace StockhamGenerator
437437 {
438438 workGroupSize = 49 ;
439439 numTrans = length >= 7 *workGroupSize ? 1 : (7 *workGroupSize)/length;
440- } else {
440+ }
441+ else if (primeFactorsExpanded[11 ] == length) // Length is pure power of 11
442+ {
443+ workGroupSize = 121 ;
444+ numTrans = length >= 11 * workGroupSize ? 1 : (11 * workGroupSize) / length;
445+ }
446+ else if (primeFactorsExpanded[13 ] == length) // Length is pure power of 13
447+ {
448+ workGroupSize = 169 ;
449+ numTrans = length >= 13 * workGroupSize ? 1 : (13 * workGroupSize) / length;
450+ }
451+ else
452+ {
441453 size_t leastNumPerWI = 1 ; // least number of elements in one work item
442454 size_t maxWorkGroupSize = MAX_WGS; // maximum work group size desired
443455
@@ -470,7 +482,14 @@ namespace StockhamGenerator
470482 leastNumPerWI = 70 ; maxWorkGroupSize = 36 ;
471483 } else if (primeFactorsExpanded[3 ] * primeFactorsExpanded[5 ] * primeFactorsExpanded[7 ] == length) {
472484 leastNumPerWI =105 ; maxWorkGroupSize = 24 ;
473- } else {
485+ }
486+ else if (primeFactorsExpanded[2 ] * primeFactorsExpanded[11 ] == length) {
487+ leastNumPerWI = 22 ; maxWorkGroupSize = 128 ;
488+ }
489+ else if (primeFactorsExpanded[2 ] * primeFactorsExpanded[13 ] == length) {
490+ leastNumPerWI = 26 ; maxWorkGroupSize = 128 ;
491+ }
492+ else {
474493 leastNumPerWI =210 ; maxWorkGroupSize = 12 ;
475494 }
476495 if (pr==P_DOUBLE)
@@ -2025,7 +2044,7 @@ namespace StockhamGenerator
20252044 fft_postCallback = postcallbackParam;
20262045 }
20272046
2028- void GeneratePass ( bool fwd, std::string &passStr, bool fft_3StepTwiddle,
2047+ void GeneratePass ( bool fwd, std::string &passStr, bool fft_3StepTwiddle, bool twiddleFront,
20292048 bool inInterleaved, bool outInterleaved,
20302049 bool inReal, bool outReal,
20312050 size_t inStride, size_t outStride, double scale,
@@ -2495,7 +2514,7 @@ namespace StockhamGenerator
24952514
24962515 // 3-step twiddle multiplies done in the front
24972516 bool tw3Done = false ;
2498- if (fft_3StepTwiddle && (position == 0 ) )
2517+ if (fft_3StepTwiddle && twiddleFront )
24992518 {
25002519 tw3Done = true ;
25012520 if (linearRegs)
@@ -3019,7 +3038,7 @@ namespace StockhamGenerator
30193038 else
30203039 {
30213040 // Possible radices
3022- size_t cRad[] = {10 ,8 ,7 ,6 ,5 ,4 ,3 ,2 ,1 }; // Must be in descending order
3041+ size_t cRad[] = {13 , 11 , 10 ,8 ,7 ,6 ,5 ,4 ,3 ,2 ,1 }; // Must be in descending order
30233042 size_t cRadSize = (sizeof (cRad)/sizeof (cRad[0 ]));
30243043
30253044 // Generate the radix and pass objects
@@ -3233,32 +3252,12 @@ namespace StockhamGenerator
32333252
32343253 std::string sfx = FloatSuffix<PR>();
32353254
3255+ // Base type
3256+ str += " #define fptype " ; str += RegBaseType<PR>(1 ); str += " \n\n " ;
3257+
32363258 // Vector type
32373259 str += " #define fvect2 " ; str += RegBaseType<PR>(2 ); str += " \n\n " ;
32383260
3239- // constants
3240- str += " #define C8Q 0.70710678118654752440084436210485" ; str += sfx; str += " \n " ;
3241-
3242- str += " #define C5QA 0.30901699437494742410229341718282" ; str += sfx; str += " \n " ;
3243- str += " #define C5QB 0.95105651629515357211643933337938" ; str += sfx; str += " \n " ;
3244- str += " #define C5QC 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3245- str += " #define C5QD 0.58778525229247312916870595463907" ; str += sfx; str += " \n " ;
3246- str += " #define C5QE 0.80901699437494742410229341718282" ; str += sfx; str += " \n " ;
3247-
3248- str += " #define C3QA 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3249- str += " #define C3QB 0.86602540378443864676372317075294" ; str += sfx; str += " \n " ;
3250-
3251- str += " #define C7Q1 -1.16666666666666651863693004997913" ; str += sfx; str += " \n " ;
3252- str += " #define C7Q2 0.79015646852540022404554065360571" ; str += sfx; str += " \n " ;
3253- str += " #define C7Q3 0.05585426728964774240049351305970" ; str += sfx; str += " \n " ;
3254- str += " #define C7Q4 0.73430220123575240531721419756650" ; str += sfx; str += " \n " ;
3255- str += " #define C7Q5 0.44095855184409837868031445395900" ; str += sfx; str += " \n " ;
3256- str += " #define C7Q6 0.34087293062393136944265847887436" ; str += sfx; str += " \n " ;
3257- str += " #define C7Q7 -0.53396936033772524066165487965918" ; str += sfx; str += " \n " ;
3258- str += " #define C7Q8 0.87484229096165666561546458979137" ; str += sfx; str += " \n " ;
3259-
3260- str += " \n " ;
3261-
32623261 bool cReg = linearRegs ? true : false ;
32633262
32643263 // Generate butterflies for all unique radices
@@ -3269,6 +3268,86 @@ namespace StockhamGenerator
32693268 uradices.sort ();
32703269 uradices.unique ();
32713270
3271+
3272+ // constants
3273+ if (length%8 == 0 )
3274+ {
3275+ str += " #define C8Q 0.70710678118654752440084436210485" ; str += sfx; str += " \n " ;
3276+ }
3277+
3278+ if (length % 5 == 0 )
3279+ {
3280+ str += " #define C5QA 0.30901699437494742410229341718282" ; str += sfx; str += " \n " ;
3281+ str += " #define C5QB 0.95105651629515357211643933337938" ; str += sfx; str += " \n " ;
3282+ str += " #define C5QC 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3283+ str += " #define C5QD 0.58778525229247312916870595463907" ; str += sfx; str += " \n " ;
3284+ str += " #define C5QE 0.80901699437494742410229341718282" ; str += sfx; str += " \n " ;
3285+ }
3286+
3287+ if (length % 3 == 0 )
3288+ {
3289+ str += " #define C3QA 0.50000000000000000000000000000000" ; str += sfx; str += " \n " ;
3290+ str += " #define C3QB 0.86602540378443864676372317075294" ; str += sfx; str += " \n " ;
3291+ }
3292+
3293+ if (length % 7 == 0 )
3294+ {
3295+ str += " #define C7Q1 -1.16666666666666651863693004997913" ; str += sfx; str += " \n " ;
3296+ str += " #define C7Q2 0.79015646852540022404554065360571" ; str += sfx; str += " \n " ;
3297+ str += " #define C7Q3 0.05585426728964774240049351305970" ; str += sfx; str += " \n " ;
3298+ str += " #define C7Q4 0.73430220123575240531721419756650" ; str += sfx; str += " \n " ;
3299+ str += " #define C7Q5 0.44095855184409837868031445395900" ; str += sfx; str += " \n " ;
3300+ str += " #define C7Q6 0.34087293062393136944265847887436" ; str += sfx; str += " \n " ;
3301+ str += " #define C7Q7 -0.53396936033772524066165487965918" ; str += sfx; str += " \n " ;
3302+ str += " #define C7Q8 0.87484229096165666561546458979137" ; str += sfx; str += " \n " ;
3303+ }
3304+
3305+ if (length % 11 == 0 )
3306+ {
3307+ str += " #define b11_0 0.9898214418809327" ; str += sfx; str += " \n " ;
3308+ str += " #define b11_1 0.9594929736144973" ; str += sfx; str += " \n " ;
3309+ str += " #define b11_2 0.9189859472289947" ; str += sfx; str += " \n " ;
3310+ str += " #define b11_3 0.8767688310025893" ; str += sfx; str += " \n " ;
3311+ str += " #define b11_4 0.8308300260037728" ; str += sfx; str += " \n " ;
3312+ str += " #define b11_5 0.7784344533346518" ; str += sfx; str += " \n " ;
3313+ str += " #define b11_6 0.7153703234534297" ; str += sfx; str += " \n " ;
3314+ str += " #define b11_7 0.6343562706824244" ; str += sfx; str += " \n " ;
3315+ str += " #define b11_8 0.3425847256816375" ; str += sfx; str += " \n " ;
3316+ str += " #define b11_9 0.5211085581132027" ; str += sfx; str += " \n " ;
3317+ }
3318+
3319+ if (length % 13 == 0 )
3320+ {
3321+ str += " #define b13_0 0.9682872443619840" ; str += sfx; str += " \n " ;
3322+ str += " #define b13_1 0.9578059925946651" ; str += sfx; str += " \n " ;
3323+ str += " #define b13_2 0.8755023024091479" ; str += sfx; str += " \n " ;
3324+ str += " #define b13_3 0.8660254037844386" ; str += sfx; str += " \n " ;
3325+ str += " #define b13_4 0.8595425350987748" ; str += sfx; str += " \n " ;
3326+ str += " #define b13_5 0.8534800018598239" ; str += sfx; str += " \n " ;
3327+ str += " #define b13_6 0.7693388175729806" ; str += sfx; str += " \n " ;
3328+ str += " #define b13_7 0.6865583707817543" ; str += sfx; str += " \n " ;
3329+ str += " #define b13_8 0.6122646503767565" ; str += sfx; str += " \n " ;
3330+ str += " #define b13_9 0.6004772719326652" ; str += sfx; str += " \n " ;
3331+ str += " #define b13_10 0.5817047785105157" ; str += sfx; str += " \n " ;
3332+ str += " #define b13_11 0.5751407294740031" ; str += sfx; str += " \n " ;
3333+ str += " #define b13_12 0.5220263851612750" ; str += sfx; str += " \n " ;
3334+ str += " #define b13_13 0.5200285718888646" ; str += sfx; str += " \n " ;
3335+ str += " #define b13_14 0.5165207806234897" ; str += sfx; str += " \n " ;
3336+ str += " #define b13_15 0.5149187780863157" ; str += sfx; str += " \n " ;
3337+ str += " #define b13_16 0.5035370328637666" ; str += sfx; str += " \n " ;
3338+ str += " #define b13_17 0.5000000000000000" ; str += sfx; str += " \n " ;
3339+ str += " #define b13_18 0.3027756377319946" ; str += sfx; str += " \n " ;
3340+ str += " #define b13_19 0.3014792600477098" ; str += sfx; str += " \n " ;
3341+ str += " #define b13_20 0.3004626062886657" ; str += sfx; str += " \n " ;
3342+ str += " #define b13_21 0.2517685164318833" ; str += sfx; str += " \n " ;
3343+ str += " #define b13_22 0.2261094450357824" ; str += sfx; str += " \n " ;
3344+ str += " #define b13_23 0.0833333333333333" ; str += sfx; str += " \n " ;
3345+ str += " #define b13_24 0.0386329546443481" ; str += sfx; str += " \n " ;
3346+ }
3347+
3348+ str += " \n " ;
3349+
3350+
32723351 // If pre-callback is set for the plan
32733352 std::string callbackstr;
32743353 if (params.fft_hasPreCallback )
@@ -3351,7 +3430,7 @@ namespace StockhamGenerator
33513430 if ((p+1 ) != passes.end ()) { outIlvd = ldsInterleaved; }
33523431 }
33533432
3354- p->GeneratePass (fwd, str, tw3Step, inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn , gOut );
3433+ p->GeneratePass (fwd, str, tw3Step, params. fft_twiddleFront , inIlvd, outIlvd, inRl, outRl, ins, outs, s, gIn , gOut );
33553434 }
33563435
33573436 // if real transform we do only 1 direction
0 commit comments