Ejemplo n.º 1
0
 void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
{
    /*
     * Hard-coded transform installed by ffts_init_1d() for N == 8 with
     * sign == 1. Loads four vectors from `in`, applies one K_N step using
     * the plan's first lookup table, and stores four vectors to `out`.
     */
    const data_t *src = (const data_t *)in;
    data_t *dst = (data_t *)out;

    /* First lookup table of the plan: p->ws offset by p->ws_is[0]. */
    float *twiddle = p->ws + p->ws_is[0];

    V a, b, c, d;

    /* Load step; note the input order +0, +8, +4, +12. */
    L_4_2(1, src, src + 8, src + 4, src + 12, &a, &b, &c, &d);

    /* The two vectors at twiddle[0..3] and twiddle[4..7] feed K_N. */
    V tw_lo = VLD(twiddle);
    V tw_hi = VLD(twiddle + 4);
    K_N(1, tw_lo, tw_hi, &a, &b, &c, &d);

    /* Store the four result vectors contiguously. */
    S_4(a, b, c, d, dst + 0, dst + 4, dst + 8, dst + 12);
}
Ejemplo n.º 2
0
 void firstpass_16_b(ffts_plan_t *p, const void *in, void *out)
{
    /*
     * Hard-coded transform installed by ffts_init_1d() for N == 16 with
     * sign == 1. Eight vectors are loaded from `in`, combined by three K_N
     * steps that read consecutive 8-float slices of p->ws, and written to
     * `out` in two S_4 stores.
     */
    const data_t *src = (const data_t *)in;
    data_t *dst = (data_t *)out;
    float *lut = p->ws;

    /* vK holds what the original code called r<K>_<K+1>. */
    V v0, v2, v4, v6;
    V v8, v10, v12, v14;

    /* Load steps. The second call fills v14 before v12 on purpose. */
    L_4_4(1, src + 0, src + 16, src + 8, src + 24, &v0, &v2, &v8, &v10);
    L_2_4(1, src + 4, src + 20, src + 28, src + 12, &v4, &v6, &v14, &v12);

    /* lut[0..7]: combine the first four vectors. */
    K_N(1, VLD(lut), VLD(lut + 4), &v0, &v2, &v4, &v6);

    /* lut[8..15]: vectors 0, 4, 8, 12, stored at dst + 0/8/16/24. */
    K_N(1, VLD(lut + 8), VLD(lut + 12), &v0, &v4, &v8, &v12);
    S_4(v0, v4, v8, v12, dst + 0, dst + 8, dst + 16, dst + 24);

    /* lut[16..23]: vectors 2, 6, 10, 14, stored at dst + 4/12/20/28. */
    K_N(1, VLD(lut + 16), VLD(lut + 20), &v2, &v6, &v10, &v14);
    S_4(v2, v6, v10, v14, dst + 4, dst + 12, dst + 20, dst + 28);
}
Ejemplo n.º 3
0
Archivo: ffts.c Proyecto: RTsGIT/ffts
/**
 * Create a plan for a 1-D transform of N points.
 *
 * N >= 32: the offset and index tables are built through
 * ffts_init_offsets() / ffts_init_is(), the i0/i1/i2 counters are derived
 * from N/leafN, and the transform entry point is installed at the end
 * (a static transform when DYNAMIC_DISABLED is defined, otherwise through
 * ffts_generate_func_code()).
 *
 * N < 32: one of the hard-coded firstpass_* routines is selected for
 * N in {2, 4, 8, 16}. For N >= 4 this additionally requires sign to be
 * exactly -1 (the *_f routines) or 1 (the *_b routines); any other
 * combination leaves p->transform NULL.
 *
 * In both cases the lookup tables in p->ws are filled from W_re()/W_im()
 * and the start of each table, in cdata_t units, is recorded in p->ws_is.
 *
 * @param N     Number of points.
 * @param sign  Direction selector: negative picks the `_f` variants,
 *              otherwise the `_b` / `_i` variants are used (presumably
 *              forward vs. inverse -- confirm against the public header).
 * @return      A newly allocated plan whose `destroy` member is set to
 *              ffts_free_1d, or NULL if the plan itself or its
 *              `transforms` array could not be allocated.
 */
ffts_plan_t *ffts_init_1d(size_t N, int sign) {
	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
	size_t leafN = 8;
	size_t i;

	/* The plan is written to immediately below; bail out on failure. */
	if(!p) return NULL;

	/*
	 * Sign mask XORed into the duplicated imaginary parts when the tables
	 * are written. The setup was identical in the __arm__ and non-__arm__
	 * branches, so it is done once here.
	 */
	V MULI_SIGN;

	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);

	p->transform = NULL;
	p->transform_base = NULL;
	p->transforms = NULL;
	p->is = NULL;
	p->ws_is = NULL;
	p->ws = NULL;
	p->offsets = NULL;
	p->destroy = ffts_free_1d;

	if(N >= 32) {
		ffts_init_offsets(p, N, leafN);
		/* Every platform branch made this exact same call. */
		ffts_init_is(p, N, leafN, 1);

		p->i0 = N/leafN/3+1;
		p->i1 = N/leafN/3;
		if((N/leafN) % 3 > 1) p->i1++;
		p->i2 = N/leafN/3;

		/* Halved everywhere except on ARM builds without NEON. */
#if !defined(__arm__) || defined(HAVE_NEON)
		p->i0/=2;
		p->i1/=2;
#endif

	}else{
		p->transforms = malloc(2 * sizeof(transform_index_t));
		if(!p->transforms) {
			/* Nothing else has been allocated on this path yet. */
			free(p);
			return NULL;
		}
		p->transforms[0] = 0;
		p->transforms[1] = 1;

		/*
		 * NOTE(review): p->transform stays NULL when (N, sign) matches none
		 * of the cases below (e.g. N == 3, or N == 8 with sign == 0).
		 */
		if(N == 2) p->transform = &firstpass_2;
		else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
		else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
		else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
		else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
		else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
		else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
	}

	/* ---- Lookup tables ---- */

	int hardcoded = 0;
	size_t n_luts;

	/*
	 * __builtin_ctzl(0) is undefined, so it must not be evaluated when the
	 * quotient is zero (N < 4 here; the original also evaluated
	 * ctzl(N/leafN) for N < 8 before overwriting the result).
	 */
	if(N < 32) {
		n_luts = (N/4) ? __builtin_ctzl(N/4) : 0;
		hardcoded = 1;
	}else{
		n_luts = __builtin_ctzl(N/leafN);
	}

	if(n_luts >= 32) n_luts = 0;

	cdata_t *w;

	int n = leafN*2;
	if(hardcoded) n = 8;

	/*
	 * Pass 1: compute the byte size of the combined table.
	 * On the first/hard-coded path n is doubled twice per iteration, while
	 * the fill loop further down doubles it once.
	 * NOTE(review): this looks like an over-estimate rather than an
	 * under-estimate; confirm before tightening, it is kept as-is here.
	 */
	size_t lut_size = 0;

	for(i=0;i<n_luts;i++) {
		if(!i || hardcoded) {
		#ifdef __arm__
			if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
			else lut_size += n/4 * sizeof(cdata_t);
		#else
			lut_size += n/4 * 2 * sizeof(cdata_t);
		#endif
			n *= 2;
		} else {
		#ifdef __arm__
			lut_size += n/8 * 3 * sizeof(cdata_t);
		#else
			lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
		#endif
		}
		n *= 2;
	}

	/*
	 * NOTE(review): the results of the FFTS_MALLOC()/malloc() calls from
	 * here on are not checked. Handling a failure would need to release
	 * whatever ffts_init_offsets()/ffts_init_is() allocated; presumably
	 * p->destroy does that, but it is not visible from this function.
	 */
	if(n_luts) {
		p->ws = FFTS_MALLOC(lut_size,32);
		p->ws_is = malloc(n_luts * sizeof(size_t));
	}else{
		p->ws = NULL;
		p->ws_is = NULL;
	}
	w = p->ws;

	n = leafN*2;
	if(hardcoded) n = 8;

	#ifdef HAVE_NEON
		/* XOR mask for imaginary lanes: all zero bits (a no-op) when
		 * sign < 0, otherwise -0.0f, which flips the sign bit. */
		V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
	#endif

	/* Pass 2: fill the tables; w always points at the next free slot. */
	for(i=0;i<n_luts;i++) {
		/* Start of table i, in cdata_t units from the base of p->ws. */
		p->ws_is[i] = w - (cdata_t *)p->ws;

		if(!i || hardcoded) {
			/* Scratch copy of the n/4 (W_re, W_im) pairs for this size. */
			cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);

			size_t j;
			for(j=0;j<n/4;j++) {
				w0[j][0]	= W_re(n,j);
				w0[j][1]	= W_im(n,j);
			}

			float *fw0 = (float *)w0;
			#ifdef __arm__
				if(N < 32) {
					/* Split layout: one vector of duplicated real parts
					 * followed by one of sign-adjusted imaginary parts. */
					float *fw = (float *)w;
					V temp0;
					for(j=0;j<n/4;j+=2) {
						temp0 = VLD(fw0 + j*2);
						V re, im;
						re = VDUPRE(temp0);
						im = VDUPIM(temp0);
						#ifdef HAVE_NEON
							im = VXOR(im, MULI_SIGN);
						#else
							im = MULI(sign>0, im);
						#endif
						VST(fw + j*4  , re);
						VST(fw + j*4+4, im);
					}
					w += n/4 * 2;
				}else{
					float *fw = (float *)w;
					#ifdef HAVE_NEON
						VS temp0;
						for(j=0;j<n/4;j+=4) {
							temp0 = VLD2(fw0 + j*2);
							temp0.val[1] = VXOR(temp0.val[1], neg);
							STORESPR(fw + j*2, temp0);
						}
					#else
						/* Scalar path: copy re, negate im unless sign < 0. */
						for(j=0;j<n/4;j+=1) {
							fw[j*2] = fw0[j*2];
							fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
						}
					#endif
					w += n/4;
				}
			#else
				/* Split layout: duplicated real parts, then sign-adjusted
				 * imaginary parts, two input pairs per iteration. */
				float *fw = (float *)w;
				V temp0;
				for(j=0;j<n/4;j+=2) {
					temp0 = VLD(fw0 + j*2);
					V re, im;
					re = VDUPRE(temp0);
					im = VDUPIM(temp0);
					im = VXOR(im, MULI_SIGN);
					VST(fw + j*4  , re);
					VST(fw + j*4+4, im);
				}
				w += n/4 * 2;
			#endif

			FFTS_FREE(w0);
		}else{
			/* Three scratch tables of n/8 entries each, taken at indices
			 * 2j, j and j + n/8 of W_re/W_im respectively. */
			cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
			cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
			cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);

			size_t j;
			for(j=0;j<n/8;j++) {
				w0[j][0]	= W_re(n,j*2);
				w0[j][1]	= W_im(n,j*2);
				w1[j][0]	= W_re(n,j);
				w1[j][1]	= W_im(n,j);
				w2[j][0]	= W_re(n,j + (n/8));
				w2[j][1]	= W_im(n,j + (n/8));
			}

			float *fw0 = (float *)w0;
			float *fw1 = (float *)w1;
			float *fw2 = (float *)w2;
			#ifdef __arm__
				float *fw = (float *)w;
				#ifdef HAVE_NEON
					VS temp0, temp1, temp2;
					for(j=0;j<n/8;j+=4) {
						temp0 = VLD2(fw0 + j*2);
						temp0.val[1] = VXOR(temp0.val[1], neg);
						STORESPR(fw + j*2*3,      temp0);
						temp1 = VLD2(fw1 + j*2);
						temp1.val[1] = VXOR(temp1.val[1], neg);
						STORESPR(fw + j*2*3 + 8,  temp1);
						temp2 = VLD2(fw2 + j*2);
						temp2.val[1] = VXOR(temp2.val[1], neg);
						STORESPR(fw + j*2*3 + 16, temp2);
					}
				#else
					/* Scalar path: interleave w0, w1, w2 entry by entry. */
					for(j=0;j<n/8;j+=1) {
						fw[j*6] = fw0[j*2];
						fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
						fw[j*6+2] = fw1[j*2+0];
						fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
						fw[j*6+4] = fw2[j*2+0];
						fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
					}
				#endif
				w += n/8 * 3;
			#else
				/* For each of w0, w1, w2: duplicated real parts followed by
				 * sign-adjusted imaginary parts, two pairs per iteration. */
				float *fw = (float *)w;
				V temp0, temp1, temp2, re, im;
				for(j=0;j<n/8;j+=2) {
					temp0 = VLD(fw0 + j*2);
					re = VDUPRE(temp0);
					im = VDUPIM(temp0);
					im = VXOR(im, MULI_SIGN);
					VST(fw + j*2*6  , re);
					VST(fw + j*2*6+4, im);

					temp1 = VLD(fw1 + j*2);
					re = VDUPRE(temp1);
					im = VDUPIM(temp1);
					im = VXOR(im, MULI_SIGN);
					VST(fw + j*2*6+8 , re);
					VST(fw + j*2*6+12, im);

					temp2 = VLD(fw2 + j*2);
					re = VDUPRE(temp2);
					im = VDUPIM(temp2);
					im = VXOR(im, MULI_SIGN);
					VST(fw + j*2*6+16, re);
					VST(fw + j*2*6+20, im);
				}
				w += n/8 * 3 * 2;
			#endif

			FFTS_FREE(w0);
			FFTS_FREE(w1);
			FFTS_FREE(w2);
		}

		n *= 2;
	}

	/* Select the slices of the shared w_data table by sign. */
	if(sign < 0) {
		p->oe_ws = (void *)(&w_data[4]);
		p->ee_ws = (void *)(w_data);
		p->eo_ws = (void *)(&w_data[4]);
	}else{
		p->oe_ws = (void *)(w_data + 12);
		p->ee_ws = (void *)(w_data + 8);
		p->eo_ws = (void *)(w_data + 12);
	}

	p->N = N;
	p->lastlut = w;	/* one past the last table written */
	p->n_luts = n_luts;
#ifdef DYNAMIC_DISABLED
	if(sign < 0) {
		if(N >= 32) p->transform = ffts_static_transform_f;
	}else{
		if(N >= 32) p->transform = ffts_static_transform_i;
	}

#else
	if(N>=32)  ffts_generate_func_code(p, N, leafN, sign);
#endif

	return p;
}