Exemple #1
0
	void PPCXEmitter::Epilogue() {		
		u32 regSize = 8; // 4 in 32bit system
		u32 stackFrameSize = 0x1F0;

		//Break();

		// Write Epilogue (restore stack frame, return)
		// free stack
		ADDI(R1, R1, stackFrameSize);	
#if 0
		ADDI(R12, R1, -0x98);	

		// Restore fpr
		for(int i = 14; i < 32; i ++) {
			LFD((PPCReg)i, R1, -((32 - i) * regSize));
		}
#endif
		// Restore gpr
		for(int i = 14; i < 32; i ++) {
			LD((PPCReg)i, R1, -((33 - i) * regSize));
		}		

		// recover r12 (LR saved register)
		LWZ (R12, R1, -0x8);

		// Restore Lr
		MTLR(R12);
		
#if 1
		// load fpr buff
		MOVI2R(R5, (u32)&_fprTmp);
		
		// Load fpr
		for(int i = 14; i < 32; i ++) {
			LFD((PPCReg)i, R5, i * regSize);
		}
#endif
	}
Exemple #2
0
	void PPCXEmitter::Epilogue() {		
		u32 regSize = 8; // 4 in 32bit system
		u32 stackFrameSize = 32*32;//(35 - 12) * regSize;

		// Write Epilogue (restore stack frame, return)
		// free stack
		ADDI(R1, R1, stackFrameSize);	

		// Restore regs
		for(int i = 14; i < 32; i ++) {
			LD((PPCReg)i, R1, -((33 - i) * regSize));
		}

		// recover r12 (LR saved register)
		LWZ (R12, R1, -0x8);

		// Restore Lr
		MTLR(R12);
	}
Exemple #3
0
	void PPCXEmitter::Prologue() {
		// Save regs
		u32 regSize = 8; // 4 in 32bit system
		u32 stackFrameSize = 0x1F0;

		// Write Prologue (setup stack frame etc ...)
		// Save Lr
		MFLR(R12);

		// Save gpr
		for(int i = 14; i < 32; i ++) {
			STD((PPCReg)i, R1, -((33 - i) * regSize));
		}

		// Save r12
		STW(R12, R1, -0x8);
#if 0
		// add fpr frame
		ADDI(R12, R1, -0x98);
		
		// Load fpr
		for(int i = 14; i < 32; i ++) {
			SFD((PPCReg)i, R1, -((32 - i) * regSize));
		}
#endif
		// allocate stack
		STWU(R1, R1, -stackFrameSize);		

#if 1
		// load fpr buff
		MOVI2R(R10, (u32)&_fprTmp);
		
		// Save fpr
		for(int i = 14; i < 32; i ++) {
			SFD((PPCReg)i, R10, i * regSize);
		}
#endif
	}
Exemple #4
0
/* Process 2OPI Integer instructions */
bool eval_2OPI_Int(struct lilith* vm, struct Instruction* c)
{
	#ifdef DEBUG
	char Name[20] = "ILLEGAL_2OPI";
	#endif

	/* 0x0E ... 0x2B */
	/* 0xB0 ... 0xDF */
	switch(c->raw2)
	{
		case 0x0E: /* ADDI */
		{
			#ifdef DEBUG
			strncpy(Name, "ADDI", 19);
			#elif TRACE
			record_trace("ADDI");
			#endif

			ADDI(vm, c);
			break;
		}
		case 0x0F: /* ADDUI */
		{
			#ifdef DEBUG
			strncpy(Name, "ADDUI", 19);
			#elif TRACE
			record_trace("ADDUI");
			#endif

			ADDUI(vm, c);
			break;
		}
		case 0x10: /* SUBI */
		{
			#ifdef DEBUG
			strncpy(Name, "SUBI", 19);
			#elif TRACE
			record_trace("SUBI");
			#endif

			SUBI(vm, c);
			break;
		}
		case 0x11: /* SUBUI */
		{
			#ifdef DEBUG
			strncpy(Name, "SUBUI", 19);
			#elif TRACE
			record_trace("SUBUI");
			#endif

			SUBUI(vm, c);
			break;
		}
		case 0x12: /* CMPI */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPI", 19);
			#elif TRACE
			record_trace("CMPI");
			#endif

			CMPI(vm, c);
			break;
		}
		case 0x13: /* LOAD */
		{
			#ifdef DEBUG
			strncpy(Name, "LOAD", 19);
			#elif TRACE
			record_trace("LOAD");
			#endif

			LOAD(vm, c);
			break;
		}
		case 0x14: /* LOAD8 */
		{
			#ifdef DEBUG
			strncpy(Name, "LOAD8", 19);
			#elif TRACE
			record_trace("LOAD8");
			#endif

			LOAD8(vm, c);
			break;
		}
		case 0x15: /* LOADU8 */
		{
			#ifdef DEBUG
			strncpy(Name, "LOADU8", 19);
			#elif TRACE
			record_trace("LOADU8");
			#endif

			LOADU8(vm, c);
			break;
		}
		case 0x16: /* LOAD16 */
		{
			#ifdef DEBUG
			strncpy(Name, "LOAD16", 19);
			#elif TRACE
			record_trace("LOAD16");
			#endif

			LOAD16(vm, c);
			break;
		}
		case 0x17: /* LOADU16 */
		{
			#ifdef DEBUG
			strncpy(Name, "LOADU16", 19);
			#elif TRACE
			record_trace("LOADU16");
			#endif

			LOADU16(vm, c);
			break;
		}
		case 0x18: /* LOAD32 */
		{
			#ifdef DEBUG
			strncpy(Name, "LOAD32", 19);
			#elif TRACE
			record_trace("LOAD32");
			#endif

			LOAD32(vm, c);
			break;
		}
		case 0x19: /* LOADU32 */
		{
			#ifdef DEBUG
			strncpy(Name, "LOADU32", 19);
			#elif TRACE
			record_trace("LOADU32");
			#endif

			LOADU32(vm, c);
			break;
		}
		case 0x1F: /* CMPUI */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPUI", 19);
			#elif TRACE
			record_trace("CMPUI");
			#endif

			CMPUI(vm, c);
			break;
		}
		case 0x20: /* STORE */
		{
			#ifdef DEBUG
			strncpy(Name, "STORE", 19);
			#elif TRACE
			record_trace("STORE");
			#endif

			STORE(vm, c);
			break;
		}
		case 0x21: /* STORE8 */
		{
			#ifdef DEBUG
			strncpy(Name, "STORE8", 19);
			#elif TRACE
			record_trace("STORE8");
			#endif

			STORE8(vm, c);
			break;
		}
		case 0x22: /* STORE16 */
		{
			#ifdef DEBUG
			strncpy(Name, "STORE16", 19);
			#elif TRACE
			record_trace("STORE16");
			#endif

			STORE16(vm, c);
			break;
		}
		case 0x23: /* STORE32 */
		{
			#ifdef DEBUG
			strncpy(Name, "STORE32", 19);
			#elif TRACE
			record_trace("STORE32");
			#endif

			STORE32(vm, c);
			break;
		}
		case 0xB0: /* ANDI */
		{
			#ifdef DEBUG
			strncpy(Name, "ANDI", 19);
			#elif TRACE
			record_trace("ANDI");
			#endif

			ANDI(vm, c);
			break;
		}
		case 0xB1: /* ORI */
		{
			#ifdef DEBUG
			strncpy(Name, "ORI", 19);
			#elif TRACE
			record_trace("ORI");
			#endif

			ORI(vm, c);
			break;
		}
		case 0xB2: /* XORI */
		{
			#ifdef DEBUG
			strncpy(Name, "XORI", 19);
			#elif TRACE
			record_trace("XORI");
			#endif

			XORI(vm, c);
			break;
		}
		case 0xB3: /* NANDI */
		{
			#ifdef DEBUG
			strncpy(Name, "NANDI", 19);
			#elif TRACE
			record_trace("NANDI");
			#endif

			NANDI(vm, c);
			break;
		}
		case 0xB4: /* NORI */
		{
			#ifdef DEBUG
			strncpy(Name, "NORI", 19);
			#elif TRACE
			record_trace("NORI");
			#endif

			NORI(vm, c);
			break;
		}
		case 0xB5: /* XNORI */
		{
			#ifdef DEBUG
			strncpy(Name, "XNORI", 19);
			#elif TRACE
			record_trace("XNORI");
			#endif

			XNORI(vm, c);
			break;
		}
		case 0xC0: /* CMPJUMPI.G */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPI.G", 19);
			#elif TRACE
			record_trace("CMPJUMPI.G");
			#endif

			CMPJUMPI_G(vm, c);
			break;
		}
		case 0xC1: /* CMPJUMPI.GE */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPI.GE", 19);
			#elif TRACE
			record_trace("CMPJUMPI.GE");
			#endif

			CMPJUMPI_GE(vm, c);
			break;
		}
		case 0xC2: /* CMPJUMPI.E */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPI.E", 19);
			#elif TRACE
			record_trace("CMPJUMPI.E");
			#endif

			CMPJUMPI_E(vm, c);
			break;
		}
		case 0xC3: /* CMPJUMPI.NE */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPI.NE", 19);
			#elif TRACE
			record_trace("CMPJUMPI.NE");
			#endif

			CMPJUMPI_NE(vm, c);
			break;
		}
		case 0xC4: /* CMPJUMPI.LE */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPI.LE", 19);
			#elif TRACE
			record_trace("CMPJUMPI.LE");
			#endif

			CMPJUMPI_LE(vm, c);
			break;
		}
		case 0xC5: /* CMPJUMPI.L */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPI.L", 19);
			#elif TRACE
			record_trace("CMPJUMPI.L");
			#endif

			CMPJUMPI_L(vm, c);
			break;
		}
		case 0xD0: /* CMPJUMPUI.G */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPUI.G", 19);
			#elif TRACE
			record_trace("CMPJUMPUI.G");
			#endif

			CMPJUMPUI_G(vm, c);
			break;
		}
		case 0xD1: /* CMPJUMPUI.GE */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPUI.GE", 19);
			#elif TRACE
			record_trace("CMPJUMPUI.GE");
			#endif

			CMPJUMPUI_GE(vm, c);
			break;
		}
		case 0xD4: /* CMPJUMPUI.LE */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPUI.LE", 19);
			#elif TRACE
			record_trace("CMPJUMPUI.LE");
			#endif

			CMPJUMPUI_LE(vm, c);
			break;
		}
		case 0xD5: /* CMPJUMPUI.L */
		{
			#ifdef DEBUG
			strncpy(Name, "CMPJUMPUI.L", 19);
			#elif TRACE
			record_trace("CMPJUMPUI.L");
			#endif

			CMPJUMPUI_L(vm, c);
			break;
		}
		default:
		{
			illegal_instruction(vm, c);
			break;
		}
	}
	#ifdef DEBUG
	fprintf(stdout, "# %s reg%u reg%u %i\n", Name, c->reg0, c->reg1, c->raw_Immediate);
	#endif
	return false;
}
Exemple #5
0
{
	// Read umd patches
	{ psp_read, STDU(SP, 0xFF90, SP), &condition_psp_iso },
	{ psp_read+4, MFLR(R0), &condition_psp_iso },
	{ psp_read+8, STD(R0, 0x80, SP), &condition_psp_iso },
	{ psp_read+0x0C, MR(R8, R7), &condition_psp_iso },
	{ psp_read+0x10, MR(R7, R6), &condition_psp_iso },
	{ psp_read+0x14, MR(R6, R5), &condition_psp_iso },
	{ psp_read+0x18, MR(R5, R4), &condition_psp_iso },
	{ psp_read+0x1C, MR(R4, R3), &condition_psp_iso },
	{ psp_read+0x20, LI(R3, SYSCALL8_OPCODE_READ_PSP_UMD), &condition_psp_iso },
	{ psp_read+0x24, LI(R11, 8), &condition_psp_iso },
	{ psp_read+0x28, SC, &condition_psp_iso },
	{ psp_read+0x2C, LD(R0, 0x80, SP), &condition_psp_iso },
	{ psp_read+0x30, MTLR(R0), &condition_psp_iso },
	{ psp_read+0x34, ADDI(SP, SP, 0x70), &condition_psp_iso },
	{ psp_read+0x38, BLR, &condition_psp_iso },
	// Read header patches
	{ psp_read+0x3C, STDU(SP, 0xFF90, SP), &condition_psp_iso },
	{ psp_read+0x40, MFLR(R0), &condition_psp_iso },
	{ psp_read+0x44, STD(R0, 0x80, SP), &condition_psp_iso },
	{ psp_read+0x48, MR(R7, R6), &condition_psp_iso },
	{ psp_read+0x4C, MR(R6, R5), &condition_psp_iso },
	{ psp_read+0x50, MR(R5, R4), &condition_psp_iso },
	{ psp_read+0x54, MR(R4, R3), &condition_psp_iso },
	{ psp_read+0x58, LI(R3, SYSCALL8_OPCODE_READ_PSP_HEADER), &condition_psp_iso },
	{ psp_read+0x5C, LI(R11, 8), &condition_psp_iso },
	{ psp_read+0x60, SC, &condition_psp_iso },
	{ psp_read+0x64, LD(R0, 0x80, SP), &condition_psp_iso },
	{ psp_read+0x68, MTLR(R0), &condition_psp_iso },
	{ psp_read+0x6C, ADDI(SP, SP, 0x70), &condition_psp_iso },
Exemple #6
0
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
	int count = tree_count(N, leafN, 0) + 1;
	size_t *ps = malloc(count * 2 * sizeof(size_t));
	size_t *pps = ps;

#ifdef __x86_64__
	if(sign < 0) p->constants = sse_constants;
	else         p->constants = sse_constants_inv;
#endif

	elaborate_tree(&pps, N, leafN, 0);
	pps[0] = 0;
	pps[1] = 0;

	pps = ps;

#ifdef __arm__ 
	if(N < 8192) p->transform_size = 8192;
	else p->transform_size = N;
#else
	if(N < 2048) p->transform_size = 16384;
	else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
#endif

#ifdef __APPLE__
	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
#else
#define MAP_ANONYMOUS 0x20	
	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
#endif

/*
	if(p->transform_base == MAP_FAILED) {
		fprintf(stderr, "MAP FAILED\n");
		exit(1);
	}*/
	insns_t *func = p->transform_base;//valloc(8192);
	insns_t *fp = func;

//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
//fprintf(stderr, "Base address = %016p\n", func);

	if(!func) { 
		fprintf(stderr, "NOMEM\n");
		exit(1);
	}

	insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_x8, neon_x8_t - neon_x8);
	if(sign < 0) {
		fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
		fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
		fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
	}
	fp += (neon_x8_t - neon_x8) / 4;
#else
	memcpy(fp, vfp_x8, vfp_end - vfp_x8);
	if(sign > 0) {
		fp[65] ^= 0x00000040; 
		fp[66] ^= 0x00000040; 
		fp[68] ^= 0x00000040; 
		fp[70] ^= 0x00000040; 
  	fp[103] ^= 0x00000040; 
  	fp[104] ^= 0x00000040; 
  	fp[105] ^= 0x00000040; 
  	fp[108] ^= 0x00000040; 
  	fp[113] ^= 0x00000040; 
  	fp[114] ^= 0x00000040; 
  	fp[117] ^= 0x00000040; 
  	fp[118] ^= 0x00000040; 
	}
	fp += (vfp_end - vfp_x8) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_8_addr = fp;
	align_mem16(&fp, 5);
	memcpy(fp, x8_soft, x8_hard - x8_soft);
	fp += (x8_hard - x8_soft);
//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
#endif
//uint32_t *x_8_t_addr = fp;
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
	insns_t *x_4_addr = fp;
#ifdef __arm__

#ifdef HAVE_NEON
	memcpy(fp, neon_x4, neon_x8 - neon_x4);
	if(sign < 0) {
		fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
	}
	fp += (neon_x8 - neon_x4) / 4;
#else
	memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
	if(sign > 0) {
		fp[36] ^= 0x00000040; 
		fp[38] ^= 0x00000040; 
		fp[43] ^= 0x00000040; 
		fp[44] ^= 0x00000040;
	}
	fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_4_addr = fp;
	memcpy(fp, x4, x8_soft - x4);
	fp += (x8_soft - x4);

#endif
	insns_t *start = fp;

#ifdef __arm__ 
	*fp = PUSH_LR(); fp++;
	*fp = 0xed2d8b10; fp++;

	ADDI(&fp, 3, 1, 0);
	ADDI(&fp, 7, 1, N);
	ADDI(&fp, 5, 1, 2*N);
	ADDI(&fp, 10, 7, 2*N);
	ADDI(&fp, 4, 5, 2*N);
	ADDI(&fp, 8, 10, 2*N);
	ADDI(&fp, 6, 4, 2*N);
	ADDI(&fp, 9, 8, 2*N);
	
  *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
//  *fp++ = LDRI(1, 0, 4); // load ws into r1
	ADDI(&fp, 1, 0, 0);

	ADDI(&fp, 0, 2, 0), // mov out into r0
#endif


#ifdef __arm__
		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
  	#ifdef HAVE_NEON
  		MOVI(&fp, 11, p->i0);
  	#else 
  		MOVI(&fp, 11, p->i0);
  	#endif

#else
	align_mem16(&fp, 0);
	start = fp;
	
	*fp++ = 0x4c;
	*fp++ = 0x8b;
	*fp++ = 0x07;
	uint32_t lp_cnt = p->i0 * 4;
	MOVI(&fp, RCX, lp_cnt);
	
	//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); 
#endif
	//fp++;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_ee, neon_oo - neon_ee);
	if(sign < 0) {
		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
	}
	fp += (neon_oo - neon_ee) / 4;
#else
		memcpy(fp, vfp_e, vfp_o - vfp_e);
		if(sign > 0) {
			fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
			fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
			fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
		}
		fp += (vfp_o - vfp_e) / 4;
#endif
#else
//fprintf(stderr, "Body start address = %016p\n", start);
	
	PUSH(&fp, RBP);	
	PUSH(&fp, RBX);
	PUSH(&fp, R10);
	PUSH(&fp, R11);
	PUSH(&fp, R12);
	PUSH(&fp, R13);
	PUSH(&fp, R14);
	PUSH(&fp, R15);
	
	int i;
	memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
	
//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
//fprintf(stderr, "Constants address = %016p\n", sse_constants);
//fprintf(stderr, "Constants address = %016p\n", p->constants);
	
//int32_t val = READ_IMM32(fp + 3);
//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));

//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
//fprintf(stderr, "IMM = 0x%llx\n", v2);

//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); 
	fp += (leaf_ee - leaf_ee_init);

//fprintf(stderr, "Leaf start address = %016p\n", fp);
	align_mem16(&fp, 9);
	memcpy(fp, leaf_ee, leaf_oo - leaf_ee);


	uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
	uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
	uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
	
	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); 
	
	fp += (leaf_oo - leaf_ee);
	
	if(__builtin_ctzl(N) & 1){
		if(p->i1) {
			lp_cnt += p->i1 * 4;
  		MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
  		memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
  		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); 
  		fp += (leaf_eo - leaf_oo);
		}
		

  	memcpy(fp, leaf_oe, leaf_end - leaf_oe);
  	lp_cnt += 4;
  	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); 
  	fp += (leaf_end - leaf_oe);

	}else{


		memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
		lp_cnt += 4;
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); 
		fp += (leaf_oe - leaf_eo);

  	if(p->i1) {
			lp_cnt += p->i1 * 4;
  		MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
  		memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
  		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); 
  		fp += (leaf_eo - leaf_oo);
  	}

	}
	if(p->i1) {

		lp_cnt += p->i1 * 4;
		MOVI(&fp, RCX, lp_cnt);
		align_mem16(&fp, 9);
		memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); 
		fp += (leaf_oo - leaf_ee);

	}
	
//fprintf(stderr, "Body start address = %016p\n", fp);
  //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); 
	memcpy(fp, x_init, x4 - x_init);
//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); 
	fp += (x4 - x_init);

	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	count = 2;
	while(pps[0]) {
	
		if(!pN) {
			MOVI(&fp, RCX, pps[0] / 4);
		}else{
  		if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
			if(pps[0] > leafN && pps[0] - pN) {
				
				int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
				*fp++ = 0xc1; 
				
				if(diff > 0) {
					*fp++ = 0xe1;
					*fp++ = (diff & 0xff);
				}else{
					*fp++ = 0xe9;
					*fp++ = ((-diff) & 0xff);
				}
			}
		}
		
  		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
  			ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); 


  	if(pps[0] == 2*leafN) {
      CALL(&fp, x_4_addr);
//  	}else if(!pps[2]){
//	  //uint32_t *x_8_t_addr = fp;
//		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
//		fp += (neon_ee - neon_x8_t) / 4;
//		//*fp++ = BL(fp+2, x_8_t_addr);
  	}else{
    		CALL(&fp, x_8_addr);
  	}

		pAddr = pps[1] * 4;
		if(pps[0] > leafN) 
			pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
//	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); 
		count += 4;
		pps += 2;
	}
#endif
#ifdef __arm__
#ifdef HAVE_NEON
	if(__builtin_ctzl(N) & 1){
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}
		
		*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; 

		memcpy(fp, neon_oe, neon_end - neon_oe);
		if(sign < 0) {
			fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
			fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
			fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
		}
		fp += (neon_end - neon_oe) / 4;

	}else{
		
		*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; 

		memcpy(fp, neon_eo, neon_oe - neon_eo);
		if(sign < 0) {
			fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
			fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
			fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
		}
		fp += (neon_oe - neon_eo) / 4;
		
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}

	}


	if(p->i1) {
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);
		
		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
	  MOVI(&fp, 11, p->i1);
  	memcpy(fp, neon_ee, neon_oo - neon_ee);
  	if(sign < 0) {
  		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
  		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
  		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
  	}
		fp += (neon_oo - neon_ee) / 4;

	}
#else
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
			MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
  		memcpy(fp, vfp_o, vfp_x4 - vfp_o);
		if(sign > 0) {
			fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
			fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
		}
  		fp += (vfp_x4 - vfp_o) / 4;
		
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);
		
		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
	  MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
  	memcpy(fp, vfp_e, vfp_o - vfp_e);
		if(sign > 0) {
			fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
			fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
			fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
		}
  	fp += (vfp_o - vfp_e) / 4;

#endif
  *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
	//ADDI(&fp, 2, 1, 0);
	MOVI(&fp, 1, 0);
	
	// args: r0 - out
	//       r1 - N
	//       r2 - ws
//	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	count = 2;
	while(pps[0]) {
	
//	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
		if(!pN) {
			MOVI(&fp, 1, pps[0]);
		}else{
  		if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
			if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
		}
		
		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
			ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); 


  	if(pps[0] == 2*leafN) {
        *fp = BL(fp+2, x_4_addr); fp++;
  	}else if(!pps[2]){
  	  //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
  		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
  		if(sign < 0) {
  			fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
  			fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
   			fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
  		}
  		fp += (neon_ee - neon_x8_t) / 4;
  		//*fp++ = BL(fp+2, x_8_t_addr);

#else
    		*fp = BL(fp+2, x_8_addr); fp++;
#endif
  	}else{
    		*fp = BL(fp+2, x_8_addr); fp++;
  	}

		pAddr = pps[1] * 4;
		pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
//	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); 
		count += 4;
		pps += 2;
	}
	
	*fp++ = 0xecbd8b10;
	*fp++ = POP_LR(); count++;
#else
	POP(&fp, R15);
	POP(&fp, R14);
	POP(&fp, R13);
	POP(&fp, R12);
	POP(&fp, R11);
	POP(&fp, R10);
	POP(&fp, RBX);
	POP(&fp, RBP);
	RET(&fp);	


//uint8_t *pp = func;
//int counter = 0;
//do{ 
//	printf("%02x ", *pp);
//	if(counter++ % 16 == 15) printf("\n");
//} while(++pp < fp);

//printf("\n");


#endif


//	*fp++ = B(14); count++;

//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) 
//	fprintf(stderr, "%08x\n", x_4_addr[i]);
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++) 

	free(ps);
	
  if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
  	perror("Couldn't mprotect");
  	exit(1);
  }
#ifdef __APPLE__
	sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
	cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
#ifdef __GNUC__
	__clear_cache((long)(func), (long)(func) + p->transform_size);
#endif
#endif

//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);

	p->transform = (void *) (start);
}
Exemple #7
0
transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
{
    uint32_t offsets[8] = {0, 4*N, 2*N, 6*N, N, 5*N, 7*N, 3*N};
    uint32_t offsets_o[8] = {0, 4*N, 2*N, 6*N, 7*N, 3*N, N, 5*N};

    int32_t pAddr = 0;
    int32_t pN = 0;
    int32_t pLUT = 0;

    insns_t  *fp;
    insns_t  *start;
    insns_t  *x_4_addr;
    insns_t  *x_8_addr;
    uint32_t  loop_count;

    int       count;
    ptrdiff_t len;

    size_t   *ps;
    size_t   *pps;

    count = ffts_tree_count(N, leaf_N, 0) + 1;

    ps = pps = malloc(2 * count * sizeof(*ps));
    if (!ps) {
        return NULL;
    }

    ffts_elaborate_tree(&pps, N, leaf_N, 0);

    pps[0] = 0;
    pps[1] = 0;

    pps = ps;

#ifdef HAVE_SSE
    if (sign < 0) {
        p->constants = (const void*) sse_constants;
    } else {
        p->constants = (const void*) sse_constants_inv;
    }
#endif

    fp = (insns_t*) p->transform_base;

    /* generate base cases */
    x_4_addr = generate_size4_base_case(&fp, sign);
    x_8_addr = generate_size8_base_case(&fp, sign);

#ifdef __arm__
    start = generate_prologue(&fp, p);

#ifdef HAVE_NEON
    memcpy(fp, neon_ee, neon_oo - neon_ee);
    if (sign < 0) {
        fp[33] ^= 0x00200000;
        fp[37] ^= 0x00200000;
        fp[38] ^= 0x00200000;
        fp[39] ^= 0x00200000;
        fp[40] ^= 0x00200000;
        fp[41] ^= 0x00200000;
        fp[44] ^= 0x00200000;
        fp[45] ^= 0x00200000;
        fp[46] ^= 0x00200000;
        fp[47] ^= 0x00200000;
        fp[48] ^= 0x00200000;
        fp[57] ^= 0x00200000;
    }

    fp += (neon_oo - neon_ee) / 4;
#else
    memcpy(fp, vfp_e, vfp_o - vfp_e);

    if (sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;
#endif
#else
    /* generate functions */
    start = generate_prologue(&fp, p);

    loop_count = 4 * p->i0;
    generate_leaf_init(&fp, loop_count);

    if (ffts_ctzl(N) & 1) {
        generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, 7);
        }

        loop_count += 4;
        generate_leaf_oe(&fp, offsets_o);
    } else {
        generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);

        loop_count += 4;
        generate_leaf_eo(&fp, offsets);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
        }
    }

    if (p->i1) {
        uint32_t offsets_oe[8] = {7*N, 3*N, N, 5*N, 0, 4*N, 6*N, 2*N};

        loop_count += 4 * p->i1;

        /* align loop/jump destination */
#ifdef _M_X64
        x86_mov_reg_imm(fp, X86_EBX, loop_count);
#else
        x86_mov_reg_imm(fp, X86_ECX, loop_count);
        ffts_align_mem16(&fp, 9);
#endif

        generate_leaf_ee(&fp, offsets_oe, 0);
    }

    generate_transform_init(&fp);

    /* generate subtransform calls */
    count = 2;
    while (pps[0]) {
        size_t ws_is;

        if (!pN) {
#ifdef _M_X64
            x86_mov_reg_imm(fp, X86_EBX, pps[0]);
#else
            x86_mov_reg_imm(fp, X86_ECX, pps[0] / 4);
#endif
        } else {
            int offset = (4 * pps[1]) - pAddr;
            if (offset) {
#ifdef _M_X64
                x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#else
                x64_alu_reg_imm_size(fp, X86_ADD, X64_RDX, offset, 8);
#endif
            }

            if (pps[0] > leaf_N && pps[0] - pN) {
                int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN);

#ifdef _M_X64
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_EBX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_EBX, -factor);
                }
#else
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_ECX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_ECX, -factor);
                }
#endif
            }
        }

        ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1];
        if (ws_is != pLUT) {
            int offset = (int) (ws_is - pLUT);

#ifdef _M_X64
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8);
#else
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#endif
        }

        if (pps[0] == 2 * leaf_N) {
            x64_call_code(fp, x_4_addr);
        } else {
            x64_call_code(fp, x_8_addr);
        }

        pAddr = 4 * pps[1];
        if (pps[0] > leaf_N) {
            pN = pps[0];
        }

        pLUT = ws_is;//LUT_offset(pps[0], leafN);
        //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }
#endif

#ifdef __arm__
#ifdef HAVE_NEON
    if (ffts_ctzl(N) & 1) {
        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }

        *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_oe, neon_end - neon_oe);
        if(sign < 0) {
            fp[19] ^= 0x00200000;
            fp[20] ^= 0x00200000;
            fp[22] ^= 0x00200000;
            fp[23] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[64] ^= 0x00200000;
            fp[65] ^= 0x00200000;
            fp[66] ^= 0x00200000;
            fp[67] ^= 0x00200000;
        }
        fp += (neon_end - neon_oe) / 4;

    } else {

        *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_eo, neon_oe - neon_eo);
        if(sign < 0) {
            fp[10] ^= 0x00200000;
            fp[11] ^= 0x00200000;
            fp[13] ^= 0x00200000;
            fp[14] ^= 0x00200000;
            fp[31] ^= 0x00200000;
            fp[33] ^= 0x00200000;
            fp[34] ^= 0x00200000;
            fp[35] ^= 0x00200000;
            fp[59] ^= 0x00200000;
            fp[60] ^= 0x00200000;
            fp[61] ^= 0x00200000;
            fp[62] ^= 0x00200000;
        }
        fp += (neon_oe - neon_eo) / 4;

        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }
    }

    if(p->i1) {
        ADDI(&fp, 2, 3, 0);
        ADDI(&fp, 3, 7, 0);
        ADDI(&fp, 7, 2, 0);

        ADDI(&fp, 2, 4, 0);
        ADDI(&fp, 4, 8, 0);
        ADDI(&fp, 8, 2, 0);

        ADDI(&fp, 2, 5, 0);
        ADDI(&fp, 5, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 6, 0);
        ADDI(&fp, 6, 10, 0);
        ADDI(&fp, 10, 2, 0);

        ADDI(&fp, 2, 9, 0);
        ADDI(&fp, 9, 10, 0);
        ADDI(&fp, 10, 2, 0);

        *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
        fp++;
        MOVI(&fp, 11, p->i1);
        memcpy(fp, neon_ee, neon_oo - neon_ee);
        if(sign < 0) {
            fp[33] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[39] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[44] ^= 0x00200000;
            fp[45] ^= 0x00200000;
            fp[46] ^= 0x00200000;
            fp[47] ^= 0x00200000;
            fp[48] ^= 0x00200000;
            fp[57] ^= 0x00200000;
        }
        fp += (neon_oo - neon_ee) / 4;
    }
#else
    ADDI(&fp, 2, 7, 0);
    ADDI(&fp, 7, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 8, 0);
    ADDI(&fp, 8, 10, 0);
    ADDI(&fp, 10, 2, 0);

    MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
    memcpy(fp, vfp_o, vfp_x4 - vfp_o);
    if(sign > 0) {
        fp[22] ^= 0x00000040;
        fp[24] ^= 0x00000040;
        fp[25] ^= 0x00000040;
        fp[26] ^= 0x00000040;
        fp[62] ^= 0x00000040;
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[66] ^= 0x00000040;
    }
    fp += (vfp_x4 - vfp_o) / 4;

    ADDI(&fp, 2, 3, 0);
    ADDI(&fp, 3, 7, 0);
    ADDI(&fp, 7, 2, 0);

    ADDI(&fp, 2, 4, 0);
    ADDI(&fp, 4, 8, 0);
    ADDI(&fp, 8, 2, 0);

    ADDI(&fp, 2, 5, 0);
    ADDI(&fp, 5, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 6, 0);
    ADDI(&fp, 6, 10, 0);
    ADDI(&fp, 10, 2, 0);

    ADDI(&fp, 2, 9, 0);
    ADDI(&fp, 9, 10, 0);
    ADDI(&fp, 10, 2, 0);

    *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
    fp++;
    MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
    memcpy(fp, vfp_e, vfp_o - vfp_e);
    if(sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;

#endif
    *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p));
    fp++; // load offsets into r12
    //ADDI(&fp, 2, 1, 0);
    MOVI(&fp, 1, 0);

    // args: r0 - out
    //       r1 - N
    //       r2 - ws
    //	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

    count = 2;
    while(pps[0]) {

        //	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
        if(!pN) {
            MOVI(&fp, 1, pps[0]);
        } else {
            if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
            if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
        }

        if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) {
            ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
        }

        if(pps[0] == 2 * leaf_N) {
            *fp = BL(fp+2, x_4_addr);
            fp++;
        } else if(!pps[2]) {
            //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
            memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
            if(sign < 0) {
                fp[31] ^= 0x00200000;
                fp[32] ^= 0x00200000;
                fp[33] ^= 0x00200000;
                fp[34] ^= 0x00200000;
                fp[65] ^= 0x00200000;
                fp[66] ^= 0x00200000;
                fp[70] ^= 0x00200000;
                fp[74] ^= 0x00200000;
                fp[97] ^= 0x00200000;
                fp[98] ^= 0x00200000;
                fp[102] ^= 0x00200000;
                fp[104] ^= 0x00200000;
            }
            fp += (neon_ee - neon_x8_t) / 4;
            //*fp++ = BL(fp+2, x_8_t_addr);

#else
            *fp = BL(fp+2, x_8_addr);
            fp++;
#endif
        } else {
            *fp = BL(fp+2, x_8_addr);
            fp++;
        }

        pAddr = pps[1] * 4;
        pN = pps[0];
        pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
        //	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }

    *fp++ = 0xecbd8b10;
    *fp++ = POP_LR();
    count++;
#else
    generate_epilogue(&fp);
#endif

    //	*fp++ = B(14); count++;

    //for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
    //	fprintf(stderr, "%08x\n", x_4_addr[i]);
    //fprintf(stderr, "\n");
    //for(int i=0;i<count;i++)

    //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));

    free(ps);

#if defined(_MSC_VER)
#pragma warning(push)

    /* disable type cast warning from data pointer to function pointer */
#pragma warning(disable : 4055)
#endif

    return (transform_func_t) start;

#if defined(_MSC_VER)
#pragma warning(pop)
#endif
}