Example #1
CMemoryFunction::CMemoryFunction(const void* code, size_t size)
: m_code(nullptr)
{
#ifdef WIN32
	/* Windows: copy the code into a heap buffer and mark those pages executable in place. */
	m_size = size;
	m_code = malloc(size);
	memcpy(m_code, code, size);
	
	DWORD oldProtect = 0;
	BOOL result = VirtualProtect(m_code, size, PAGE_EXECUTE_READWRITE, &oldProtect);
	assert(result == TRUE);
#elif defined(__APPLE__)
	/* Darwin: allocate whole pages, copy the code in, invalidate the instruction cache,
	   then switch the pages to read/execute. */
	vm_size_t page_size = 0;
	host_page_size(mach_task_self(), &page_size);
	unsigned int allocSize = ((size + page_size - 1) / page_size) * page_size;
	vm_allocate(mach_task_self(), reinterpret_cast<vm_address_t*>(&m_code), allocSize, TRUE); 
	memcpy(m_code, code, size);
	sys_icache_invalidate(m_code, size);
	kern_return_t result = vm_protect(mach_task_self(), reinterpret_cast<vm_address_t>(m_code), size, 0, VM_PROT_READ | VM_PROT_EXECUTE);
	assert(result == 0);
	m_size = allocSize;
#elif defined(__ANDROID__) || defined(__linux__) || defined(__FreeBSD__)
	/* Linux-like systems: mmap an anonymous writable+executable mapping, copy the code in,
	   and flush the instruction cache on ARM targets. */
	m_size = size;
	m_code = mmap(nullptr, size, PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	assert(m_code != MAP_FAILED);
	memcpy(m_code, code, size);
#if defined(__arm__) || defined(__aarch64__)
	__clear_cache(m_code, reinterpret_cast<uint8*>(m_code) + size);
#endif
#endif
}
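The constructor above follows the usual allocate, copy, flush, protect sequence for a buffer of generated code. For comparison, below is a minimal POSIX-only sketch of that same sequence, assuming mmap/mprotect and the GCC/Clang __builtin___clear_cache builtin are available; the helper name alloc_exec_copy is illustrative and not part of the class above.

#include <stddef.h>
#include <string.h>
#include <sys/mman.h>

/* Minimal sketch, assuming POSIX mmap/mprotect and the GCC/Clang
 * __builtin___clear_cache builtin; the helper name is illustrative. */
static void* alloc_exec_copy(const void* code, size_t size)
{
	void* buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
	                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if(buf == MAP_FAILED) return NULL;
	memcpy(buf, code, size);                                /* write the instructions     */
	__builtin___clear_cache((char*)buf, (char*)buf + size); /* sync icache with dcache    */
	if(mprotect(buf, size, PROT_READ | PROT_EXEC) != 0) {   /* drop write permission (W^X) */
		munmap(buf, size);
		return NULL;
	}
	return buf;
}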
Example #2
static inline void
ffi_clear_cache (void *start, void *end)
{
#if defined (__clang__) && defined (__APPLE__)
	sys_icache_invalidate (start, (char *)end - (char *)start);
#elif defined (__GNUC__)
	__builtin___clear_cache (start, end);
#else
#error "Missing builtin to flush instruction cache"
#endif
}
Example #3
void __clear_cache(void* start, void* end)
{
#if __i386__ || __x86_64__
/*
 * Intel processors have a unified instruction and data cache
 * so there is nothing to do
 */
#else
    #if __APPLE__
        /* On Darwin, sys_icache_invalidate() provides this functionality */
        sys_icache_invalidate(start, (char *)end - (char *)start);
    #else
        compilerrt_abort();
    #endif
#endif
}
Example #4
void cache_flush_d_inval_i(void *start, void *end)
{
#ifdef __arm__
#if defined(__BLACKBERRY_QNX__)
   msync(start, (char *)end - (char *)start, MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
#elif defined(__MACH__)
   size_t len = (char *)end - (char *)start;
   sys_dcache_flush(start, len);
   sys_icache_invalidate(start, len);
#elif defined(_3DS)
   ctr_flush_invalidate_cache();
#else
   __clear_cache(start, end);
#endif
#endif
}
Example #5
/* Synchronize data/instruction cache. */
void lj_mcode_sync(void *start, void *end)
{
#ifdef LUAJIT_USE_VALGRIND
  VALGRIND_DISCARD_TRANSLATIONS(start, (char *)end-(char *)start);
#endif
#if LJ_TARGET_X86ORX64
  UNUSED(start); UNUSED(end);
#elif LJ_TARGET_IOS
  sys_icache_invalidate(start, (char *)end-(char *)start);
#elif LJ_TARGET_PPC
  lj_vm_cachesync(start, end);
#elif defined(__GNUC__)
  __clear_cache(start, end);
#else
#error "Missing builtin to flush instruction cache"
#endif
}
Example #6
/* On ARM and other platforms, we need to flush the cache after
   writing code into memory, so the processor reliably sees it. */
void flushExec (W_ len, AdjustorExecutable exec_addr)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
  /* x86 doesn't need to do anything, so just suppress some warnings. */
  (void)len;
  (void)exec_addr;
#elif (defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)) && defined(ios_HOST_OS)
  /* On iOS we need to use the special 'sys_icache_invalidate' call. */
  sys_icache_invalidate(exec_addr, len);
#elif defined(__GNUC__)
  /* For all other platforms, fall back to a libgcc builtin. */
  unsigned char* begin = (unsigned char*)exec_addr;
  unsigned char* end   = begin + len;
  __clear_cache((void*)begin, (void*)end);
#else
#error Missing support to flush the instruction cache
#endif
}
Example #7
int
sys_cache_control(int function, void *start, size_t len)
{
	int	status = 0;
	
	switch( function ) {
	
	case kCacheFunctionPrepareForExecution:
		sys_icache_invalidate(start, len);
		break;
		
	case kCacheFunctionFlushDcache:
		sys_dcache_flush(start, len);
		break;
		
	default:
		status = ENOTSUP;
	}
	
	return	status;
}
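As a hedged usage sketch of the dispatcher above (Darwin only, assuming <libkern/OSCacheControl.h>, which on macOS/iOS declares sys_cache_control() and the kCacheFunction* constants; the wrapper name is illustrative): after writing generated instructions into a buffer, flush the data cache and then invalidate the instruction cache over the same range before jumping into it.

#include <libkern/OSCacheControl.h>
#include <stddef.h>

/* Illustrative wrapper, not part of the example above. */
static int make_range_executable_coherent(void *start, size_t len)
{
	int status = sys_cache_control(kCacheFunctionFlushDcache, start, len);   /* push pending writes out */
	if (status != 0)
		return status;
	return sys_cache_control(kCacheFunctionPrepareForExecution, start, len); /* invalidate icache */
}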
Example #8
int memsync(void *start, void *end)
{
   size_t len = (char*)end - (char*)start;
#if defined(__MACH__) && defined(__arm__)
   sys_dcache_flush(start, len);
   sys_icache_invalidate(start, len);
   return 0;
#elif defined(__arm__) && !defined(__QNX__)
   (void)len;
   __clear_cache(start, end);
   return 0;
#elif defined(HAVE_MMAN)
   return msync(start, len, MS_SYNC | MS_INVALIDATE
#ifdef __QNX__
         | MS_CACHE_ONLY
#endif
         );
#else
   (void)len;
   return 0;
#endif
}
Example #9
void __clear_cache(void* start, void* end)
{
#if __i386__ || __x86_64__
/*
 * Intel processors have a unified instruction and data cache
 * so there is nothing to do
 */
#elif defined(__NetBSD__) && defined(__arm__)
  struct arm_sync_icache_args arg;

  arg.addr = (uintptr_t)start;
  arg.len = (uintptr_t)end - (uintptr_t)start;

  sysarch(ARM_SYNC_ICACHE, &arg);
#else
    #if __APPLE__
        /* On Darwin, sys_icache_invalidate() provides this functionality */
        sys_icache_invalidate(start, (char *)end - (char *)start);
    #else
        compilerrt_abort();
    #endif
#endif
}
Example #10
/* Darwin-only stand-in for the GCC __clear_cache builtin: flush the data cache
   and invalidate the instruction cache over [start, end). */
static void __clear_cache(void *start, void *end) {
  size_t len = (char *)end - (char *)start;
  sys_dcache_flush(start, len);
  sys_icache_invalidate(start, len);
}
Example #11
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
	int count = tree_count(N, leafN, 0) + 1;
	size_t *ps = malloc(count * 2 * sizeof(size_t));
	size_t *pps = ps;

#ifdef __x86_64__
	if(sign < 0) p->constants = sse_constants;
	else         p->constants = sse_constants_inv;
#endif

	elaborate_tree(&pps, N, leafN, 0);
	pps[0] = 0;
	pps[1] = 0;

	pps = ps;

#ifdef __arm__ 
	if(N < 8192) p->transform_size = 8192;
	else p->transform_size = N;
#else
	if(N < 2048) p->transform_size = 16384;
	else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
#endif

#ifdef __APPLE__
	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
#else
#define MAP_ANONYMOUS 0x20	
	p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
#endif

/*
	if(p->transform_base == MAP_FAILED) {
		fprintf(stderr, "MAP FAILED\n");
		exit(1);
	}*/
	insns_t *func = p->transform_base;//valloc(8192);
	insns_t *fp = func;

//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
//fprintf(stderr, "Base address = %016p\n", func);

	if(!func) { 
		fprintf(stderr, "NOMEM\n");
		exit(1);
	}

	insns_t *x_8_addr = fp;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_x8, neon_x8_t - neon_x8);
	if(sign < 0) {
		fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
		fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
		fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
	}
	fp += (neon_x8_t - neon_x8) / 4;
#else
	memcpy(fp, vfp_x8, vfp_end - vfp_x8);
	if(sign > 0) {
		fp[65] ^= 0x00000040; 
		fp[66] ^= 0x00000040; 
		fp[68] ^= 0x00000040; 
		fp[70] ^= 0x00000040; 
		fp[103] ^= 0x00000040;
		fp[104] ^= 0x00000040;
		fp[105] ^= 0x00000040;
		fp[108] ^= 0x00000040;
		fp[113] ^= 0x00000040;
		fp[114] ^= 0x00000040;
		fp[117] ^= 0x00000040;
		fp[118] ^= 0x00000040;
	}
	fp += (vfp_end - vfp_x8) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_8_addr = fp;
	align_mem16(&fp, 5);
	memcpy(fp, x8_soft, x8_hard - x8_soft);
	fp += (x8_hard - x8_soft);
//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
#endif
//uint32_t *x_8_t_addr = fp;
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
//fp += (neon_end - neon_x8_t) / 4;
	insns_t *x_4_addr = fp;
#ifdef __arm__

#ifdef HAVE_NEON
	memcpy(fp, neon_x4, neon_x8 - neon_x4);
	if(sign < 0) {
		fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
	}
	fp += (neon_x8 - neon_x4) / 4;
#else
	memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
	if(sign > 0) {
		fp[36] ^= 0x00000040; 
		fp[38] ^= 0x00000040; 
		fp[43] ^= 0x00000040; 
		fp[44] ^= 0x00000040;
	}
	fp += (vfp_x8 - vfp_x4) / 4;
#endif
#else
	align_mem16(&fp, 0);
	x_4_addr = fp;
	memcpy(fp, x4, x8_soft - x4);
	fp += (x8_soft - x4);

#endif
	insns_t *start = fp;

#ifdef __arm__ 
	*fp = PUSH_LR(); fp++;
	*fp = 0xed2d8b10; fp++;

	ADDI(&fp, 3, 1, 0);
	ADDI(&fp, 7, 1, N);
	ADDI(&fp, 5, 1, 2*N);
	ADDI(&fp, 10, 7, 2*N);
	ADDI(&fp, 4, 5, 2*N);
	ADDI(&fp, 8, 10, 2*N);
	ADDI(&fp, 6, 4, 2*N);
	ADDI(&fp, 9, 8, 2*N);
	
  *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
//  *fp++ = LDRI(1, 0, 4); // load ws into r1
	ADDI(&fp, 1, 0, 0);

	ADDI(&fp, 0, 2, 0); // mov out into r0
#endif


#ifdef __arm__
		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
  	#ifdef HAVE_NEON
  		MOVI(&fp, 11, p->i0);
  	#else 
  		MOVI(&fp, 11, p->i0);
  	#endif

#else
	align_mem16(&fp, 0);
	start = fp;
	
	*fp++ = 0x4c;
	*fp++ = 0x8b;
	*fp++ = 0x07;
	uint32_t lp_cnt = p->i0 * 4;
	MOVI(&fp, RCX, lp_cnt);
	
	//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); 
#endif
	//fp++;
#ifdef __arm__
#ifdef HAVE_NEON
	memcpy(fp, neon_ee, neon_oo - neon_ee);
	if(sign < 0) {
		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
	}
	fp += (neon_oo - neon_ee) / 4;
#else
		memcpy(fp, vfp_e, vfp_o - vfp_e);
		if(sign > 0) {
			fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
			fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
			fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
		}
		fp += (vfp_o - vfp_e) / 4;
#endif
#else
//fprintf(stderr, "Body start address = %016p\n", start);
	
	PUSH(&fp, RBP);	
	PUSH(&fp, RBX);
	PUSH(&fp, R10);
	PUSH(&fp, R11);
	PUSH(&fp, R12);
	PUSH(&fp, R13);
	PUSH(&fp, R14);
	PUSH(&fp, R15);
	
	int i;
	memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
	
//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
//fprintf(stderr, "Constants address = %016p\n", sse_constants);
//fprintf(stderr, "Constants address = %016p\n", p->constants);
	
//int32_t val = READ_IMM32(fp + 3);
//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));

//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
//fprintf(stderr, "IMM = 0x%llx\n", v2);

//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); 
	fp += (leaf_ee - leaf_ee_init);

//fprintf(stderr, "Leaf start address = %016p\n", fp);
	align_mem16(&fp, 9);
	memcpy(fp, leaf_ee, leaf_oo - leaf_ee);


	uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
	uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
	uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
	
	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); 
	
	fp += (leaf_oo - leaf_ee);
	
	if(__builtin_ctzl(N) & 1){
		if(p->i1) {
			lp_cnt += p->i1 * 4;
  		MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
  		memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
  		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); 
  		fp += (leaf_eo - leaf_oo);
		}
		

  	memcpy(fp, leaf_oe, leaf_end - leaf_oe);
  	lp_cnt += 4;
  	for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); 
  	fp += (leaf_end - leaf_oe);

	}else{


		memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
		lp_cnt += 4;
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); 
		fp += (leaf_oe - leaf_eo);

  	if(p->i1) {
			lp_cnt += p->i1 * 4;
  		MOVI(&fp, RCX, lp_cnt);
			align_mem16(&fp, 4);
  		memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
  		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); 
  		fp += (leaf_eo - leaf_oo);
  	}

	}
	if(p->i1) {

		lp_cnt += p->i1 * 4;
		MOVI(&fp, RCX, lp_cnt);
		align_mem16(&fp, 9);
		memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
		for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); 
		fp += (leaf_oo - leaf_ee);

	}
	
//fprintf(stderr, "Body start address = %016p\n", fp);
  //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); 
	memcpy(fp, x_init, x4 - x_init);
//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); 
	fp += (x4 - x_init);

	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	count = 2;
	while(pps[0]) {
	
		if(!pN) {
			MOVI(&fp, RCX, pps[0] / 4);
		}else{
  		if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
			if(pps[0] > leafN && pps[0] - pN) {
				
				int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
				*fp++ = 0xc1; 
				
				if(diff > 0) {
					*fp++ = 0xe1;
					*fp++ = (diff & 0xff);
				}else{
					*fp++ = 0xe9;
					*fp++ = ((-diff) & 0xff);
				}
			}
		}
		
  		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
  			ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); 


  	if(pps[0] == 2*leafN) {
      CALL(&fp, x_4_addr);
//  	}else if(!pps[2]){
//	  //uint32_t *x_8_t_addr = fp;
//		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
//		fp += (neon_ee - neon_x8_t) / 4;
//		//*fp++ = BL(fp+2, x_8_t_addr);
  	}else{
    		CALL(&fp, x_8_addr);
  	}

		pAddr = pps[1] * 4;
		if(pps[0] > leafN) 
			pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
//	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); 
		count += 4;
		pps += 2;
	}
#endif
#ifdef __arm__
#ifdef HAVE_NEON
	if(__builtin_ctzl(N) & 1){
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}
		
		*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; 

		memcpy(fp, neon_oe, neon_end - neon_oe);
		if(sign < 0) {
			fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
			fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
			fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
		}
		fp += (neon_end - neon_oe) / 4;

	}else{
		
		*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; 

		memcpy(fp, neon_eo, neon_oe - neon_eo);
		if(sign < 0) {
			fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
			fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
			fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
		}
		fp += (neon_oe - neon_eo) / 4;
		
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
		if(p->i1) {
			MOVI(&fp, 11, p->i1);
			memcpy(fp, neon_oo, neon_eo - neon_oo);
			if(sign < 0) {
				fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
				fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
				fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
			}
			fp += (neon_eo - neon_oo) / 4;
		}

	}


	if(p->i1) {
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);
		
		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
	  MOVI(&fp, 11, p->i1);
  	memcpy(fp, neon_ee, neon_oo - neon_ee);
  	if(sign < 0) {
  		fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
  		fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
  		fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
  	}
		fp += (neon_oo - neon_ee) / 4;

	}
#else
		ADDI(&fp, 2, 7, 0);
		ADDI(&fp, 7, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 8, 0);
		ADDI(&fp, 8, 10, 0);
		ADDI(&fp, 10, 2, 0);
	
			MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
  		memcpy(fp, vfp_o, vfp_x4 - vfp_o);
		if(sign > 0) {
			fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
			fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
		}
  		fp += (vfp_x4 - vfp_o) / 4;
		
		ADDI(&fp, 2, 3, 0);
		ADDI(&fp, 3, 7, 0);
		ADDI(&fp, 7, 2, 0);

		ADDI(&fp, 2, 4, 0);
		ADDI(&fp, 4, 8, 0);
		ADDI(&fp, 8, 2, 0);
		
		ADDI(&fp, 2, 5, 0);
		ADDI(&fp, 5, 9, 0);
		ADDI(&fp, 9, 2, 0);

		ADDI(&fp, 2, 6, 0);
		ADDI(&fp, 6, 10, 0);
		ADDI(&fp, 10, 2, 0);

		ADDI(&fp, 2, 9, 0);
		ADDI(&fp, 9, 10, 0);
		ADDI(&fp, 10, 2, 0);

		*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; 
	  MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
  	memcpy(fp, vfp_e, vfp_o - vfp_e);
		if(sign > 0) {
			fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
			fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
			fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
		}
  	fp += (vfp_o - vfp_e) / 4;

#endif
  *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load ws into r2
	//ADDI(&fp, 2, 1, 0);
	MOVI(&fp, 1, 0);
	
	// args: r0 - out
	//       r1 - N
	//       r2 - ws
//	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

	int32_t pAddr = 0;
	int32_t pN = 0;
	int32_t pLUT = 0;
	count = 2;
	while(pps[0]) {
	
//	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
		if(!pN) {
			MOVI(&fp, 1, pps[0]);
		}else{
  		if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
			if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
		}
		
		if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
			ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); 


  	if(pps[0] == 2*leafN) {
        *fp = BL(fp+2, x_4_addr); fp++;
  	}else if(!pps[2]){
  	  //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
  		memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
  		if(sign < 0) {
  			fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
  			fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
   			fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
  		}
  		fp += (neon_ee - neon_x8_t) / 4;
  		//*fp++ = BL(fp+2, x_8_t_addr);

#else
    		*fp = BL(fp+2, x_8_addr); fp++;
#endif
  	}else{
    		*fp = BL(fp+2, x_8_addr); fp++;
  	}

		pAddr = pps[1] * 4;
		pN = pps[0];
		pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
//	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); 
		count += 4;
		pps += 2;
	}
	
	*fp++ = 0xecbd8b10;
	*fp++ = POP_LR(); count++;
#else
	POP(&fp, R15);
	POP(&fp, R14);
	POP(&fp, R13);
	POP(&fp, R12);
	POP(&fp, R11);
	POP(&fp, R10);
	POP(&fp, RBX);
	POP(&fp, RBP);
	RET(&fp);	


//uint8_t *pp = func;
//int counter = 0;
//do{ 
//	printf("%02x ", *pp);
//	if(counter++ % 16 == 15) printf("\n");
//} while(++pp < fp);

//printf("\n");


#endif


//	*fp++ = B(14); count++;

//for(int i=0;i<(neon_x8 - neon_x4)/4;i++) 
//	fprintf(stderr, "%08x\n", x_4_addr[i]);
//fprintf(stderr, "\n");
//for(int i=0;i<count;i++) 

	free(ps);
	
  if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
  	perror("Couldn't mprotect");
  	exit(1);
  }
#ifdef __APPLE__
	sys_icache_invalidate(func, p->transform_size);
#elif __ANDROID__
	cacheflush((long)(func), (long)(func) + p->transform_size, 0);
#elif __linux__
#ifdef __GNUC__
	__clear_cache((char *)(func), (char *)(func) + p->transform_size);
#endif
#endif

//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);

	p->transform = (void *) (start);
}
void clear_insn_cache(u32 start, u32 end, int type)
{
	sys_icache_invalidate((void*)start, end - start);
}