/*
 * Hard-coded 8-point first pass, "_b" variant.
 *
 * ffts_init_1d() installs this routine as p->transform when N == 8 and
 * sign == 1.  Input is read from `in` at float offsets 0, 8, 4 and 12,
 * combined with the first lookup table of the plan, and written to `out`
 * at float offsets 0, 4, 8 and 12.
 */
void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *src = (const data_t *)in;
	data_t *dst = (data_t *)out;
	/* First table of the plan; p->ws_is[0] is the offset recorded for it. */
	float *lut = p->ws + p->ws_is[0];
	V a, b, c, d;

	/* Load the inputs and run the leaf stage. */
	L_4_2(1, src, src + 8, src + 4, src + 12, &a, &b, &c, &d);

	/* Combine using the two vectors stored at lut and lut + 4. */
	K_N(1, VLD(lut), VLD(lut + 4), &a, &b, &c, &d);

	/* Store the four result vectors. */
	S_4(a, b, c, d, dst + 0, dst + 4, dst + 8, dst + 12);
}
/*
 * Hard-coded 16-point first pass, "_b" variant.
 *
 * ffts_init_1d() installs this routine as p->transform when N == 16 and
 * sign == 1.  The lookup table is read straight from the start of p->ws,
 * at float offsets 0, 4, 8, 12, 16 and 20.
 */
void firstpass_16_b(ffts_plan_t *p, const void *in, void *out)
{
	const data_t *src = (const data_t *)in;
	data_t *dst = (data_t *)out;
	float *lut = p->ws;
	V v0, v2, v4, v6, v8, v10, v12, v14;

	/* Leaf stages: load all inputs into the eight working vectors. */
	L_4_4(1, src + 0, src + 16, src + 8, src + 24, &v0, &v2, &v8, &v10);
	L_2_4(1, src + 4, src + 20, src + 28, src + 12, &v4, &v6, &v14, &v12);

	/* First combine step on the lower four vectors. */
	K_N(1, VLD(lut), VLD(lut + 4), &v0, &v2, &v4, &v6);

	/* Second step, even group; results go to dst + 0/8/16/24. */
	K_N(1, VLD(lut + 8), VLD(lut + 12), &v0, &v4, &v8, &v12);
	S_4(v0, v4, v8, v12, dst + 0, dst + 8, dst + 16, dst + 24);

	/* Second step, odd group; results go to dst + 4/12/20/28. */
	K_N(1, VLD(lut + 16), VLD(lut + 20), &v2, &v6, &v10, &v14);
	S_4(v2, v6, v10, v14, dst + 4, dst + 12, dst + 20, dst + 28);
}
ffts_plan_t *ffts_init_1d(size_t N, int sign) { ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); size_t leafN = 8; size_t i; #ifdef __arm__ //#ifdef HAVE_NEON V MULI_SIGN; if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); //#endif #else V MULI_SIGN; if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); else MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); #endif p->transform = NULL; p->transform_base = NULL; p->transforms = NULL; p->is = NULL; p->ws_is = NULL; p->ws = NULL; p->offsets = NULL; p->destroy = ffts_free_1d; if(N >= 32) { ffts_init_offsets(p, N, leafN); #ifdef __arm__ #ifdef HAVE_NEON ffts_init_is(p, N, leafN, 1); #else ffts_init_is(p, N, leafN, 1); #endif #else ffts_init_is(p, N, leafN, 1); #endif p->i0 = N/leafN/3+1; p->i1 = N/leafN/3; if((N/leafN) % 3 > 1) p->i1++; p->i2 = N/leafN/3; #ifdef __arm__ #ifdef HAVE_NEON p->i0/=2; p->i1/=2; #endif #else p->i0/=2; p->i1/=2; #endif }else{ p->transforms = malloc(2 * sizeof(transform_index_t)); p->transforms[0] = 0; p->transforms[1] = 1; if(N == 2) p->transform = &firstpass_2; else if(N == 4 && sign == -1) p->transform = &firstpass_4_f; else if(N == 4 && sign == 1) p->transform = &firstpass_4_b; else if(N == 8 && sign == -1) p->transform = &firstpass_8_f; else if(N == 8 && sign == 1) p->transform = &firstpass_8_b; else if(N == 16 && sign == -1) p->transform = &firstpass_16_f; else if(N == 16 && sign == 1) p->transform = &firstpass_16_b; p->is = NULL; p->offsets = NULL; } int hardcoded = 0; /* LUTS */ size_t n_luts = __builtin_ctzl(N/leafN); if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } if(n_luts >= 32) n_luts = 0; // fprintf(stderr, "n_luts = %zu\n", n_luts); cdata_t *w; int n = leafN*2; if(hardcoded) n = 8; size_t lut_size = 0; for(i=0;i<n_luts;i++) { if(!i || hardcoded) { #ifdef __arm__ if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); else lut_size += n/4 * sizeof(cdata_t); #else lut_size += n/4 * 2 * sizeof(cdata_t); #endif n *= 2; } else 
{ #ifdef __arm__ lut_size += n/8 * 3 * sizeof(cdata_t); #else lut_size += n/8 * 3 * 2 * sizeof(cdata_t); #endif } n *= 2; } // lut_size *= 16; // fprintf(stderr, "lut size = %zu\n", lut_size); if(n_luts) { p->ws = FFTS_MALLOC(lut_size,32); p->ws_is = malloc(n_luts * sizeof(size_t)); }else{ p->ws = NULL; p->ws_is = NULL; } w = p->ws; n = leafN*2; if(hardcoded) n = 8; #ifdef HAVE_NEON V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); #endif for(i=0;i<n_luts;i++) { p->ws_is[i] = w - (cdata_t *)p->ws; //fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]); if(!i || hardcoded) { cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); size_t j; for(j=0;j<n/4;j++) { w0[j][0] = W_re(n,j); w0[j][1] = W_im(n,j); } float *fw0 = (float *)w0; #ifdef __arm__ if(N < 32) { //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); float *fw = (float *)w; V temp0, temp1, temp2; for(j=0;j<n/4;j+=2) { // #ifdef HAVE_NEON temp0 = VLD(fw0 + j*2); V re, im; re = VDUPRE(temp0); im = VDUPIM(temp0); #ifdef HAVE_NEON im = VXOR(im, MULI_SIGN); //im = IMULI(sign>0, im); #else im = MULI(sign>0, im); #endif VST(fw + j*4 , re); VST(fw + j*4+4, im); // #endif } w += n/4 * 2; }else{ //w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); float *fw = (float *)w; #ifdef HAVE_NEON VS temp0, temp1, temp2; for(j=0;j<n/4;j+=4) { temp0 = VLD2(fw0 + j*2); temp0.val[1] = VXOR(temp0.val[1], neg); STORESPR(fw + j*2, temp0); } #else for(j=0;j<n/4;j+=1) { fw[j*2] = fw0[j*2]; fw[j*2+1] = (sign < 0) ? 
fw0[j*2+1] : -fw0[j*2+1]; } #endif w += n/4; } #else //w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); float *fw = (float *)w; V temp0, temp1, temp2; for(j=0;j<n/4;j+=2) { temp0 = VLD(fw0 + j*2); V re, im; re = VDUPRE(temp0); im = VDUPIM(temp0); im = VXOR(im, MULI_SIGN); VST(fw + j*4 , re); VST(fw + j*4+4, im); } w += n/4 * 2; #endif FFTS_FREE(w0); }else{ cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); size_t j; for(j=0;j<n/8;j++) { w0[j][0] = W_re(n,j*2); w0[j][1] = W_im(n,j*2); w1[j][0] = W_re(n,j); w1[j][1] = W_im(n,j); w2[j][0] = W_re(n,j + (n/8)); w2[j][1] = W_im(n,j + (n/8)); } float *fw0 = (float *)w0; float *fw1 = (float *)w1; float *fw2 = (float *)w2; #ifdef __arm__ //w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); float *fw = (float *)w; #ifdef HAVE_NEON VS temp0, temp1, temp2; for(j=0;j<n/8;j+=4) { temp0 = VLD2(fw0 + j*2); temp0.val[1] = VXOR(temp0.val[1], neg); STORESPR(fw + j*2*3, temp0); temp1 = VLD2(fw1 + j*2); temp1.val[1] = VXOR(temp1.val[1], neg); STORESPR(fw + j*2*3 + 8, temp1); temp2 = VLD2(fw2 + j*2); temp2.val[1] = VXOR(temp2.val[1], neg); STORESPR(fw + j*2*3 + 16, temp2); } #else for(j=0;j<n/8;j+=1) { fw[j*6] = fw0[j*2]; fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; fw[j*6+2] = fw1[j*2+0]; fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; fw[j*6+4] = fw2[j*2+0]; fw[j*6+5] = (sign < 0) ? 
fw2[j*2+1] : -fw2[j*2+1]; } #endif w += n/8 * 3; #else //w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); float *fw = (float *)w; V temp0, temp1, temp2, re, im; for(j=0;j<n/8;j+=2) { temp0 = VLD(fw0 + j*2); re = VDUPRE(temp0); im = VDUPIM(temp0); im = VXOR(im, MULI_SIGN); VST(fw + j*2*6 , re); VST(fw + j*2*6+4, im); temp1 = VLD(fw1 + j*2); re = VDUPRE(temp1); im = VDUPIM(temp1); im = VXOR(im, MULI_SIGN); VST(fw + j*2*6+8 , re); VST(fw + j*2*6+12, im); temp2 = VLD(fw2 + j*2); re = VDUPRE(temp2); im = VDUPIM(temp2); im = VXOR(im, MULI_SIGN); VST(fw + j*2*6+16, re); VST(fw + j*2*6+20, im); } w += n/8 * 3 * 2; #endif FFTS_FREE(w0); FFTS_FREE(w1); FFTS_FREE(w2); } ///p->ws[i] = w; n *= 2; } float *tmp = (float *)p->ws; if(sign < 0) { p->oe_ws = (void *)(&w_data[4]); p->ee_ws = (void *)(w_data); p->eo_ws = (void *)(&w_data[4]); }else{ p->oe_ws = (void *)(w_data + 12); p->ee_ws = (void *)(w_data + 8); p->eo_ws = (void *)(w_data + 12); } p->N = N; p->lastlut = w; p->n_luts = n_luts; #ifdef DYNAMIC_DISABLED if(sign < 0) { if(N >= 32) p->transform = ffts_static_transform_f; }else{ if(N >= 32) p->transform = ffts_static_transform_i; } #else if(N>=32) ffts_generate_func_code(p, N, leafN, sign); #endif return p; }