示例#1
0
文件: codegen.c 项目: matszpk/ffts
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
	int count = tree_count(N, leafN, 0) + 1;
	size_t *ps = malloc(count * 2 * sizeof(size_t));
	size_t *pps = ps;

#ifdef __x86_64__
	if(sign < 0) p->constants = sse_constants;
	else         p->constants = sse_constants_inv;
#endif

	elaborate_tree(&pps, N, leafN, 0);
	pps[0] = 0;
	pps[1] = 0;

	pps = ps;

#ifdef __arm__ 
	if(N < 8192) p->transform_size = 8192;
	else p->transform_size = N;
#else
	if(N < 2048) p->transform_size = 16384;
	else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
#endif

#ifdef __APPLE__
	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
#else
#define MAP_ANONYMOUS 0x20	
	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
#endif

/*
	if(p->transform_base == MAP_FAILED) {
		fprintf(stderr, "MAP FAILED\n");
		exit(1);
	}*/
	insns_t *func = p->transform_base;//valloc(8192);
	insns_t *fp = func;

//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
//fprintf(stderr, "Base address = %016p\n", func);

	if(!func) { 
		fprintf(stderr, "NOMEM\n");
		exit(1);
	}

	insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_x8, neon_x8_t - neon_x8);
	if(sign < 0) {
		fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
		fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
		fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
	}
	fp += (neon_x8_t - neon_x8) / 4;
#else
	memcpy(fp, vfp_x8, vfp_end - vfp_x8);
	if(sign > 0) {
		fp[65] ^= 0x00000040; 
		fp[66] ^= 0x00000040; 
		fp[68] ^= 0x00000040; 
		fp[70] ^= 0x00000040; 
  	fp[103] ^= 0x00000040; 
  	fp[104] ^= 0x00000040; 
  	fp[105] ^= 0x00000040; 
  	fp[108] ^= 0x00000040; 
  	fp[113] ^= 0x00000040; 
  	fp[114] ^= 0x00000040; 
  	fp[117] ^= 0x00000040; 
  	fp[118] ^= 0x00000040; 
	}
	fp += (vfp_end - vfp_x8) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_8_addr = fp;
	align_mem16(&fp, 5);
	memcpy(fp, x8_soft, x8_hard - x8_soft);
	fp += (x8_hard - x8_soft);
//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
#endif
//uint32_t *x_8_t_addr = fp;
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
	insns_t *x_4_addr = fp;
#ifdef __arm__

#ifdef HAVE_NEON
	memcpy(fp, neon_x4, neon_x8 - neon_x4);
	if(sign < 0) {
		fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
	}
	fp += (neon_x8 - neon_x4) / 4;
#else
	memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
	if(sign > 0) {
		fp[36] ^= 0x00000040; 
		fp[38] ^= 0x00000040; 
		fp[43] ^= 0x00000040; 
		fp[44] ^= 0x00000040;
	}
	fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_4_addr = fp;
	memcpy(fp, x4, x8_soft - x4);
	fp += (x8_soft - x4);

#endif
	insns_t *start = fp;

#ifdef __arm__ 
	*fp = PUSH_LR(); fp++;
	*fp = 0xed2d8b10; fp++;

	ADDI(&fp, 3, 1, 0);
	ADDI(&fp, 7, 1, N);
	ADDI(&fp, 5, 1, 2*N);
	ADDI(&fp, 10, 7, 2*N);
	ADDI(&fp, 4, 5, 2*N);
	ADDI(&fp, 8, 10, 2*N);
	ADDI(&fp, 6, 4, 2*N);
	ADDI(&fp, 9, 8, 2*N);
	
  *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
//  *fp++ = LDRI(1, 0, 4); // load ws into r1
	ADDI(&fp, 1, 0, 0);

	ADDI(&fp, 0, 2, 0), // mov out into r0
#endif


#ifdef __arm__
		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
  	#ifdef HAVE_NEON
  		MOVI(&fp, 11, p->i0);
  	#else 
  		MOVI(&fp, 11, p->i0);
  	#endif

#else
	align_mem16(&fp, 0);
	start = fp;
	
	*fp++ = 0x4c;
	*fp++ = 0x8b;
	*fp++ = 0x07;
	uint32_t lp_cnt = p->i0 * 4;
	MOVI(&fp, RCX, lp_cnt);
	
	//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); 
#endif
	//fp++;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_ee, neon_oo - neon_ee);
	if(sign < 0) {
		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
	}
	fp += (neon_oo - neon_ee) / 4;
#else
		memcpy(fp, vfp_e, vfp_o - vfp_e);
		if(sign > 0) {
			fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
			fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
			fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
		}
		fp += (vfp_o - vfp_e) / 4;
#endif
#else
//fprintf(stderr, "Body start address = %016p\n", start);
	
	PUSH(&fp, RBP);	
	PUSH(&fp, RBX);
	PUSH(&fp, R10);
	PUSH(&fp, R11);
	PUSH(&fp, R12);
	PUSH(&fp, R13);
	PUSH(&fp, R14);
	PUSH(&fp, R15);
	
	int i;
	memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
	
//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
//fprintf(stderr, "Constants address = %016p\n", sse_constants);
//fprintf(stderr, "Constants address = %016p\n", p->constants);
	
//int32_t val = READ_IMM32(fp + 3);
//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));

//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
//fprintf(stderr, "IMM = 0x%llx\n", v2);

//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); 
	fp += (leaf_ee - leaf_ee_init);

//fprintf(stderr, "Leaf start address = %016p\n", fp);
	align_mem16(&fp, 9);
	memcpy(fp, leaf_ee, leaf_oo - leaf_ee);


	uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
	uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
	uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
	
	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); 
	
	fp += (leaf_oo - leaf_ee);
	
	if(__builtin_ctzl(N) & 1){
		if(p->i1) {
			lp_cnt += p->i1 * 4;
  		MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
  		memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
  		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); 
  		fp += (leaf_eo - leaf_oo);
		}
		

  	memcpy(fp, leaf_oe, leaf_end - leaf_oe);
  	lp_cnt += 4;
  	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); 
  	fp += (leaf_end - leaf_oe);

	}else{


		memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
		lp_cnt += 4;
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); 
		fp += (leaf_oe - leaf_eo);

  	if(p->i1) {
			lp_cnt += p->i1 * 4;
  		MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
  		memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
  		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); 
  		fp += (leaf_eo - leaf_oo);
  	}

	}
	if(p->i1) {

		lp_cnt += p->i1 * 4;
		MOVI(&fp, RCX, lp_cnt);
		align_mem16(&fp, 9);
		memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); 
		fp += (leaf_oo - leaf_ee);

	}
	
//fprintf(stderr, "Body start address = %016p\n", fp);
  //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); 
	memcpy(fp, x_init, x4 - x_init);
//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); 
	fp += (x4 - x_init);

	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	count = 2;
	while(pps[0]) {
	
		if(!pN) {
			MOVI(&fp, RCX, pps[0] / 4);
		}else{
  		if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
			if(pps[0] > leafN && pps[0] - pN) {
				
				int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
				*fp++ = 0xc1; 
				
				if(diff > 0) {
					*fp++ = 0xe1;
					*fp++ = (diff & 0xff);
				}else{
					*fp++ = 0xe9;
					*fp++ = ((-diff) & 0xff);
				}
			}
		}
		
  		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
  			ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); 


  	if(pps[0] == 2*leafN) {
      CALL(&fp, x_4_addr);
//  	}else if(!pps[2]){
//	  //uint32_t *x_8_t_addr = fp;
//		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
//		fp += (neon_ee - neon_x8_t) / 4;
//		//*fp++ = BL(fp+2, x_8_t_addr);
  	}else{
    		CALL(&fp, x_8_addr);
  	}

		pAddr = pps[1] * 4;
		if(pps[0] > leafN) 
			pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
//	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); 
		count += 4;
		pps += 2;
	}
#endif
#ifdef __arm__
#ifdef HAVE_NEON
	if(__builtin_ctzl(N) & 1){
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}
		
		*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; 

		memcpy(fp, neon_oe, neon_end - neon_oe);
		if(sign < 0) {
			fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
			fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
			fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
		}
		fp += (neon_end - neon_oe) / 4;

	}else{
		
		*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; 

		memcpy(fp, neon_eo, neon_oe - neon_eo);
		if(sign < 0) {
			fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
			fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
			fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
		}
		fp += (neon_oe - neon_eo) / 4;
		
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}

	}


	if(p->i1) {
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);
		
		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
	  MOVI(&fp, 11, p->i1);
  	memcpy(fp, neon_ee, neon_oo - neon_ee);
  	if(sign < 0) {
  		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
  		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
  		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
  	}
		fp += (neon_oo - neon_ee) / 4;

	}
#else
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
			MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
  		memcpy(fp, vfp_o, vfp_x4 - vfp_o);
		if(sign > 0) {
			fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
			fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
		}
  		fp += (vfp_x4 - vfp_o) / 4;
		
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);
		
		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
	  MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
  	memcpy(fp, vfp_e, vfp_o - vfp_e);
		if(sign > 0) {
			fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
			fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
			fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
		}
  	fp += (vfp_o - vfp_e) / 4;

#endif
  *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
	//ADDI(&fp, 2, 1, 0);
	MOVI(&fp, 1, 0);
	
	// args: r0 - out
	//       r1 - N
	//       r2 - ws
//	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	count = 2;
	while(pps[0]) {
	
//	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
		if(!pN) {
			MOVI(&fp, 1, pps[0]);
		}else{
  		if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
			if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
		}
		
		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
			ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); 


  	if(pps[0] == 2*leafN) {
        *fp = BL(fp+2, x_4_addr); fp++;
  	}else if(!pps[2]){
  	  //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
  		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
  		if(sign < 0) {
  			fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
  			fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
   			fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
  		}
  		fp += (neon_ee - neon_x8_t) / 4;
  		//*fp++ = BL(fp+2, x_8_t_addr);

#else
    		*fp = BL(fp+2, x_8_addr); fp++;
#endif
  	}else{
    		*fp = BL(fp+2, x_8_addr); fp++;
  	}

		pAddr = pps[1] * 4;
		pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
//	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); 
		count += 4;
		pps += 2;
	}
	
	*fp++ = 0xecbd8b10;
	*fp++ = POP_LR(); count++;
#else
	POP(&fp, R15);
	POP(&fp, R14);
	POP(&fp, R13);
	POP(&fp, R12);
	POP(&fp, R11);
	POP(&fp, R10);
	POP(&fp, RBX);
	POP(&fp, RBP);
	RET(&fp);	


//uint8_t *pp = func;
//int counter = 0;
//do{ 
//	printf("%02x ", *pp);
//	if(counter++ % 16 == 15) printf("\n");
//} while(++pp < fp);

//printf("\n");


#endif


//	*fp++ = B(14); count++;

//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) 
//	fprintf(stderr, "%08x\n", x_4_addr[i]);
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++) 

	free(ps);
	
  if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
  	perror("Couldn't mprotect");
  	exit(1);
  }
#ifdef __APPLE__
	sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
	cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
#ifdef __GNUC__
	__clear_cache((long)(func), (long)(func) + p->transform_size);
#endif
#endif

//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);

	p->transform = (void *) (start);
}
示例#2
0
文件: codegen.c 项目: anthonix/ffts
transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
{
    uint32_t offsets[8] = {0, 4*N, 2*N, 6*N, N, 5*N, 7*N, 3*N};
    uint32_t offsets_o[8] = {0, 4*N, 2*N, 6*N, 7*N, 3*N, N, 5*N};

    int32_t pAddr = 0;
    int32_t pN = 0;
    int32_t pLUT = 0;

    insns_t  *fp;
    insns_t  *start;
    insns_t  *x_4_addr;
    insns_t  *x_8_addr;
    uint32_t  loop_count;

    int       count;
    ptrdiff_t len;

    size_t   *ps;
    size_t   *pps;

    count = ffts_tree_count(N, leaf_N, 0) + 1;

    ps = pps = malloc(2 * count * sizeof(*ps));
    if (!ps) {
        return NULL;
    }

    ffts_elaborate_tree(&pps, N, leaf_N, 0);

    pps[0] = 0;
    pps[1] = 0;

    pps = ps;

#ifdef HAVE_SSE
    if (sign < 0) {
        p->constants = (const void*) sse_constants;
    } else {
        p->constants = (const void*) sse_constants_inv;
    }
#endif

    fp = (insns_t*) p->transform_base;

    /* generate base cases */
    x_4_addr = generate_size4_base_case(&fp, sign);
    x_8_addr = generate_size8_base_case(&fp, sign);

#ifdef __arm__
    start = generate_prologue(&fp, p);

#ifdef HAVE_NEON
    memcpy(fp, neon_ee, neon_oo - neon_ee);
    if (sign < 0) {
        fp[33] ^= 0x00200000;
        fp[37] ^= 0x00200000;
        fp[38] ^= 0x00200000;
        fp[39] ^= 0x00200000;
        fp[40] ^= 0x00200000;
        fp[41] ^= 0x00200000;
        fp[44] ^= 0x00200000;
        fp[45] ^= 0x00200000;
        fp[46] ^= 0x00200000;
        fp[47] ^= 0x00200000;
        fp[48] ^= 0x00200000;
        fp[57] ^= 0x00200000;
    }

    fp += (neon_oo - neon_ee) / 4;
#else
    memcpy(fp, vfp_e, vfp_o - vfp_e);

    if (sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;
#endif
#else
    /* generate functions */
    start = generate_prologue(&fp, p);

    loop_count = 4 * p->i0;
    generate_leaf_init(&fp, loop_count);

    if (ffts_ctzl(N) & 1) {
        generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, 7);
        }

        loop_count += 4;
        generate_leaf_oe(&fp, offsets_o);
    } else {
        generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);

        loop_count += 4;
        generate_leaf_eo(&fp, offsets);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
        }
    }

    if (p->i1) {
        uint32_t offsets_oe[8] = {7*N, 3*N, N, 5*N, 0, 4*N, 6*N, 2*N};

        loop_count += 4 * p->i1;

        /* align loop/jump destination */
#ifdef _M_X64
        x86_mov_reg_imm(fp, X86_EBX, loop_count);
#else
        x86_mov_reg_imm(fp, X86_ECX, loop_count);
        ffts_align_mem16(&fp, 9);
#endif

        generate_leaf_ee(&fp, offsets_oe, 0);
    }

    generate_transform_init(&fp);

    /* generate subtransform calls */
    count = 2;
    while (pps[0]) {
        size_t ws_is;

        if (!pN) {
#ifdef _M_X64
            x86_mov_reg_imm(fp, X86_EBX, pps[0]);
#else
            x86_mov_reg_imm(fp, X86_ECX, pps[0] / 4);
#endif
        } else {
            int offset = (4 * pps[1]) - pAddr;
            if (offset) {
#ifdef _M_X64
                x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#else
                x64_alu_reg_imm_size(fp, X86_ADD, X64_RDX, offset, 8);
#endif
            }

            if (pps[0] > leaf_N && pps[0] - pN) {
                int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN);

#ifdef _M_X64
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_EBX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_EBX, -factor);
                }
#else
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_ECX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_ECX, -factor);
                }
#endif
            }
        }

        ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1];
        if (ws_is != pLUT) {
            int offset = (int) (ws_is - pLUT);

#ifdef _M_X64
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8);
#else
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#endif
        }

        if (pps[0] == 2 * leaf_N) {
            x64_call_code(fp, x_4_addr);
        } else {
            x64_call_code(fp, x_8_addr);
        }

        pAddr = 4 * pps[1];
        if (pps[0] > leaf_N) {
            pN = pps[0];
        }

        pLUT = ws_is;//LUT_offset(pps[0], leafN);
        //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }
#endif

#ifdef __arm__
#ifdef HAVE_NEON
    if (ffts_ctzl(N) & 1) {
        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }

        *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_oe, neon_end - neon_oe);
        if(sign < 0) {
            fp[19] ^= 0x00200000;
            fp[20] ^= 0x00200000;
            fp[22] ^= 0x00200000;
            fp[23] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[64] ^= 0x00200000;
            fp[65] ^= 0x00200000;
            fp[66] ^= 0x00200000;
            fp[67] ^= 0x00200000;
        }
        fp += (neon_end - neon_oe) / 4;

    } else {

        *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_eo, neon_oe - neon_eo);
        if(sign < 0) {
            fp[10] ^= 0x00200000;
            fp[11] ^= 0x00200000;
            fp[13] ^= 0x00200000;
            fp[14] ^= 0x00200000;
            fp[31] ^= 0x00200000;
            fp[33] ^= 0x00200000;
            fp[34] ^= 0x00200000;
            fp[35] ^= 0x00200000;
            fp[59] ^= 0x00200000;
            fp[60] ^= 0x00200000;
            fp[61] ^= 0x00200000;
            fp[62] ^= 0x00200000;
        }
        fp += (neon_oe - neon_eo) / 4;

        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }
    }

    if(p->i1) {
        ADDI(&fp, 2, 3, 0);
        ADDI(&fp, 3, 7, 0);
        ADDI(&fp, 7, 2, 0);

        ADDI(&fp, 2, 4, 0);
        ADDI(&fp, 4, 8, 0);
        ADDI(&fp, 8, 2, 0);

        ADDI(&fp, 2, 5, 0);
        ADDI(&fp, 5, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 6, 0);
        ADDI(&fp, 6, 10, 0);
        ADDI(&fp, 10, 2, 0);

        ADDI(&fp, 2, 9, 0);
        ADDI(&fp, 9, 10, 0);
        ADDI(&fp, 10, 2, 0);

        *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
        fp++;
        MOVI(&fp, 11, p->i1);
        memcpy(fp, neon_ee, neon_oo - neon_ee);
        if(sign < 0) {
            fp[33] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[39] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[44] ^= 0x00200000;
            fp[45] ^= 0x00200000;
            fp[46] ^= 0x00200000;
            fp[47] ^= 0x00200000;
            fp[48] ^= 0x00200000;
            fp[57] ^= 0x00200000;
        }
        fp += (neon_oo - neon_ee) / 4;
    }
#else
    ADDI(&fp, 2, 7, 0);
    ADDI(&fp, 7, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 8, 0);
    ADDI(&fp, 8, 10, 0);
    ADDI(&fp, 10, 2, 0);

    MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
    memcpy(fp, vfp_o, vfp_x4 - vfp_o);
    if(sign > 0) {
        fp[22] ^= 0x00000040;
        fp[24] ^= 0x00000040;
        fp[25] ^= 0x00000040;
        fp[26] ^= 0x00000040;
        fp[62] ^= 0x00000040;
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[66] ^= 0x00000040;
    }
    fp += (vfp_x4 - vfp_o) / 4;

    ADDI(&fp, 2, 3, 0);
    ADDI(&fp, 3, 7, 0);
    ADDI(&fp, 7, 2, 0);

    ADDI(&fp, 2, 4, 0);
    ADDI(&fp, 4, 8, 0);
    ADDI(&fp, 8, 2, 0);

    ADDI(&fp, 2, 5, 0);
    ADDI(&fp, 5, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 6, 0);
    ADDI(&fp, 6, 10, 0);
    ADDI(&fp, 10, 2, 0);

    ADDI(&fp, 2, 9, 0);
    ADDI(&fp, 9, 10, 0);
    ADDI(&fp, 10, 2, 0);

    *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
    fp++;
    MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
    memcpy(fp, vfp_e, vfp_o - vfp_e);
    if(sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;

#endif
    *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p));
    fp++; // load offsets into r12
    //ADDI(&fp, 2, 1, 0);
    MOVI(&fp, 1, 0);

    // args: r0 - out
    //       r1 - N
    //       r2 - ws
    //	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

    count = 2;
    while(pps[0]) {

        //	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
        if(!pN) {
            MOVI(&fp, 1, pps[0]);
        } else {
            if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
            if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
        }

        if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) {
            ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
        }

        if(pps[0] == 2 * leaf_N) {
            *fp = BL(fp+2, x_4_addr);
            fp++;
        } else if(!pps[2]) {
            //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
            memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
            if(sign < 0) {
                fp[31] ^= 0x00200000;
                fp[32] ^= 0x00200000;
                fp[33] ^= 0x00200000;
                fp[34] ^= 0x00200000;
                fp[65] ^= 0x00200000;
                fp[66] ^= 0x00200000;
                fp[70] ^= 0x00200000;
                fp[74] ^= 0x00200000;
                fp[97] ^= 0x00200000;
                fp[98] ^= 0x00200000;
                fp[102] ^= 0x00200000;
                fp[104] ^= 0x00200000;
            }
            fp += (neon_ee - neon_x8_t) / 4;
            //*fp++ = BL(fp+2, x_8_t_addr);

#else
            *fp = BL(fp+2, x_8_addr);
            fp++;
#endif
        } else {
            *fp = BL(fp+2, x_8_addr);
            fp++;
        }

        pAddr = pps[1] * 4;
        pN = pps[0];
        pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
        //	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }

    *fp++ = 0xecbd8b10;
    *fp++ = POP_LR();
    count++;
#else
    generate_epilogue(&fp);
#endif

    //	*fp++ = B(14); count++;

    //for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
    //	fprintf(stderr, "%08x\n", x_4_addr[i]);
    //fprintf(stderr, "\n");
    //for(int i=0;i<count;i++)

    //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));

    free(ps);

#if defined(_MSC_VER)
#pragma warning(push)

    /* disable type cast warning from data pointer to function pointer */
#pragma warning(disable : 4055)
#endif

    return (transform_func_t) start;

#if defined(_MSC_VER)
#pragma warning(pop)
#endif
}