CMemoryFunction::CMemoryFunction(const void* code, size_t size) : m_code(nullptr) { #ifdef WIN32 m_size = size; m_code = malloc(size); memcpy(m_code, code, size); DWORD oldProtect = 0; BOOL result = VirtualProtect(m_code, size, PAGE_EXECUTE_READWRITE, &oldProtect); assert(result == TRUE); #elif defined(__APPLE__) vm_size_t page_size = 0; host_page_size(mach_task_self(), &page_size); unsigned int allocSize = ((size + page_size - 1) / page_size) * page_size; vm_allocate(mach_task_self(), reinterpret_cast<vm_address_t*>(&m_code), allocSize, TRUE); memcpy(m_code, code, size); sys_icache_invalidate(m_code, size); kern_return_t result = vm_protect(mach_task_self(), reinterpret_cast<vm_address_t>(m_code), size, 0, VM_PROT_READ | VM_PROT_EXECUTE); assert(result == 0); m_size = allocSize; #elif defined(__ANDROID__) || defined(__linux__) || defined(__FreeBSD__) m_size = size; m_code = mmap(nullptr, size, PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); assert(m_code != MAP_FAILED); memcpy(m_code, code, size); #if defined(__arm__) || defined(__aarch64__) __clear_cache(m_code, reinterpret_cast<uint8*>(m_code) + size); #endif #endif }
/* Flush the instruction cache for the address range [start, end).
   Apple clang builds go through sys_icache_invalidate(); every other
   GNU-compatible compiler uses the __builtin___clear_cache builtin.
   Compilation fails on compilers that provide neither.  */
static inline void
ffi_clear_cache (void *start, void *end)
{
#if defined (__clang__) && defined (__APPLE__)
  const char *first = (const char *) start;
  const char *last = (const char *) end;

  /* The Darwin call takes a byte count rather than an end address.  */
  sys_icache_invalidate (start, last - first);
#elif defined (__GNUC__)
  __builtin___clear_cache (start, end);
#else
#error "Missing builtin to flush instruction cache"
#endif
}
/*
 * Flush the instruction cache for the address range [start, end).
 *
 *  - i386 / x86_64: no operation.
 *  - Darwin (other architectures): forwards to sys_icache_invalidate().
 *  - anything else: calls compilerrt_abort().
 */
void __clear_cache(void* start, void* end)
{
#if __i386__ || __x86_64__
/*
 * Intel processors have a unified instruction and data cache
 * so there is nothing to do
 */
#else
    #if __APPLE__
        /* On Darwin, sys_icache_invalidate() provides this functionality.
         * The length is computed through char* because arithmetic on
         * void* is a compiler extension, not standard C. */
        sys_icache_invalidate(start, (char*)end - (char*)start);
    #else
        compilerrt_abort();
    #endif
#endif
}
/*
 * Flush the data cache and invalidate the instruction cache for the
 * address range [start, end).
 *
 * Only does work when built for 32-bit ARM (__arm__); on every other
 * target the body is empty. The mechanism is chosen per platform:
 * msync on BlackBerry QNX, sys_dcache_flush + sys_icache_invalidate on
 * Mach, ctr_flush_invalidate_cache on 3DS, __clear_cache otherwise.
 */
void cache_flush_d_inval_i(void *start, void *end)
{
#ifdef __arm__
#if defined(__BLACKBERRY_QNX__)
   /* Length via char* casts: subtracting void pointers is a compiler
    * extension, and this now matches the Mach branch below. */
   msync(start, (char *)end - (char *)start,
         MS_SYNC | MS_CACHE_ONLY | MS_INVALIDATE_ICACHE);
#elif defined(__MACH__)
   size_t len = (char *)end - (char *)start;
   sys_dcache_flush(start, len);
   sys_icache_invalidate(start, len);
#elif defined(_3DS)
   /* The 3DS helper takes no range. */
   ctr_flush_invalidate_cache();
#else
   __clear_cache(start, end);
#endif
#endif
}
/*
** Synchronize data/instruction cache for the range [start, end).
** The target macros select the mechanism; x86/x64 only marks the
** arguments as used.
*/
void lj_mcode_sync(void *start, void *end)
{
#ifdef LUAJIT_USE_VALGRIND
  /* Valgrind builds additionally report the range to Valgrind. */
  VALGRIND_DISCARD_TRANSLATIONS(start, (char *)end-(char *)start);
#endif

#if LJ_TARGET_X86ORX64
  UNUSED(start);
  UNUSED(end);
#else
#if LJ_TARGET_IOS
  /* This call takes a byte count, not an end pointer. */
  sys_icache_invalidate(start, (char *)end-(char *)start);
#elif LJ_TARGET_PPC
  lj_vm_cachesync(start, end);
#elif defined(__GNUC__)
  __clear_cache(start, end);
#else
#error "Missing builtin to flush instruction cache"
#endif
#endif
}
/* On ARM and other platforms, we need to flush the cache after
 * writing code into memory, so the processor reliably sees it.
 *
 * len       - number of bytes of code starting at exec_addr
 * exec_addr - start of the freshly written code
 */
void flushExec (W_ len, AdjustorExecutable exec_addr)
{
#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
  /* x86 doesn't need to do anything, so just suppress some warnings. */
  (void)len;
  (void)exec_addr;
#elif (defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)) && defined(ios_HOST_OS)
  /* On iOS we need to use the special 'sys_icache_invalidate' call.
   * Its second argument is a byte count, not an end address, so pass
   * 'len' directly. */
  sys_icache_invalidate(exec_addr, len);
#elif defined(__GNUC__)
  /* For all other platforms, fall back to a libgcc builtin, which takes
   * a begin/end pointer pair. */
  unsigned char* begin = (unsigned char*)exec_addr;
  unsigned char* end   = begin + len;
  __clear_cache((void*)begin, (void*)end);
#else
#error Missing support to flush the instruction cache
#endif
}
/*
 * Dispatch a cache maintenance request on the range [start, start + len).
 *
 * function selects the operation:
 *   kCacheFunctionPrepareForExecution -> sys_icache_invalidate(start, len)
 *   kCacheFunctionFlushDcache         -> sys_dcache_flush(start, len)
 *
 * Returns 0 for a recognised function and ENOTSUP for any other value.
 */
int sys_cache_control(int function, void *start, size_t len)
{
    if (function == kCacheFunctionPrepareForExecution) {
        sys_icache_invalidate(start, len);
        return 0;
    }

    if (function == kCacheFunctionFlushDcache) {
        sys_dcache_flush(start, len);
        return 0;
    }

    /* Unknown request. */
    return ENOTSUP;
}
/*
 * Synchronise caches for the address range [start, end).
 *
 *  - Mach on 32-bit ARM: sys_dcache_flush + sys_icache_invalidate, returns 0.
 *  - other 32-bit ARM (not QNX): __clear_cache, returns 0.
 *  - HAVE_MMAN: returns the result of msync() on the range.
 *  - otherwise: does nothing and returns 0.
 */
int memsync(void *start, void *end)
{
   size_t len = (char*)end - (char*)start;
#if defined(__MACH__) && defined(__arm__)
   sys_dcache_flush(start ,len);
   sys_icache_invalidate(start, len);
   return 0;
#elif defined(__arm__) && !defined(__QNX__)
   (void)len;
   __clear_cache(start, end);
   return 0;
#elif defined(HAVE_MMAN)
   /* The QNX-only flag must be OR-ed into the mask; without the '|' the
    * expression does not parse when __QNX__ is defined. */
   return msync(start, len, MS_SYNC | MS_INVALIDATE
#ifdef __QNX__
         | MS_CACHE_ONLY
#endif
         );
#else
   (void)len;
   return 0;
#endif
}
/*
 * Flush the instruction cache for the address range [start, end).
 *
 *  - i386 / x86_64: no operation.
 *  - NetBSD on ARM: sysarch(ARM_SYNC_ICACHE) with the range's address
 *    and length.
 *  - Darwin (other architectures): forwards to sys_icache_invalidate().
 *  - anything else: calls compilerrt_abort().
 */
void __clear_cache(void* start, void* end)
{
#if __i386__ || __x86_64__
/*
 * Intel processors have a unified instruction and data cache
 * so there is nothing to do
 */
#elif defined(__NetBSD__) && defined(__arm__)
    struct arm_sync_icache_args arg;

    arg.addr = (uintptr_t)start;
    arg.len = (uintptr_t)end - (uintptr_t)start;

    sysarch(ARM_SYNC_ICACHE, &arg);
#else
    #if __APPLE__
        /* On Darwin, sys_icache_invalidate() provides this functionality.
         * The length is computed through char* because arithmetic on
         * void* is a compiler extension, not standard C. */
        sys_icache_invalidate(start, (char*)end - (char*)start);
    #else
        compilerrt_abort();
    #endif
#endif
}
/*
 * File-local __clear_cache: flushes the data cache and then invalidates
 * the instruction cache for the address range [start, end).
 */
static void __clear_cache(void *start, void *end)
{
	char *const first = (char *)start;
	char *const last = (char *)end;
	const size_t nbytes = last - first;

	sys_dcache_flush(start, nbytes);
	sys_icache_invalidate(start, nbytes);
}
/*
 * ffts_generate_func_code - assemble the transform routine for plan `p` in
 * a freshly mapped region and publish it through p->transform.
 *
 * Parameters (as used below):
 *   p     - plan; this function writes p->constants (x86_64 only),
 *           p->transform_size, p->transform_base and p->transform, and
 *           reads p->i0, p->i1, p->i2, p->ws_is and the offsets of several
 *           members relative to p.
 *   N     - transform size; selects the region size, patched immediates
 *           and, via __builtin_ctzl(N) & 1, which leaf fragments are used.
 *   leafN - leaf size passed to tree_count()/elaborate_tree() and used to
 *           index p->ws_is.
 *   sign  - selects the constants table on x86_64 and whether the copied
 *           ARM fragments have individual words XOR-patched.
 *
 * Outline:
 *   1. Allocate `ps` (count * 2 size_t entries), fill it through
 *      elaborate_tree(), and terminate it with a 0,0 pair.
 *   2. Choose p->transform_size and mmap() a read/write anonymous shared
 *      region of that size as p->transform_base.
 *   3. memcpy() pre-assembled fragments (neon_* / vfp_* on __arm__,
 *      x8_soft / x4 / leaf_* / x_init otherwise) into the region, patching
 *      words or immediates at fixed offsets within each copy.
 *   4. Walk the (pps[0], pps[1]) pairs until pps[0] is 0, emitting
 *      register adjustments and a call/branch to x_4_addr or x_8_addr
 *      for each pair.
 *   5. Emit the epilogue, free(ps), mprotect() the region to
 *      PROT_READ | PROT_EXEC (perror + exit(1) on failure), flush the
 *      instruction cache with a platform-specific call, and store `start`
 *      in p->transform.
 *
 * Exits the process with "NOMEM" if p->transform_base is a null pointer.
 *
 * NOTE(review): the MAP_FAILED check after mmap() is commented out and the
 *   live check is `if(!func)`; POSIX mmap() reports failure as MAP_FAILED,
 *   so a failed mapping does not appear to be caught here - confirm.
 * NOTE(review): the malloc() result for `ps` is used without a null check.
 * NOTE(review): on non-Apple builds MAP_ANONYMOUS is #defined to 0x20
 *   inside the function body; presumably intended for Linux - verify it
 *   does not clash with <sys/mman.h> on other targets.
 * NOTE(review): the Linux branch passes (long) values to __clear_cache,
 *   whereas the other callers seen for it pass pointers - confirm.
 * NOTE(review): `count` is updated in the emit loops but its final value
 *   is only referenced from commented-out code.
 */
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) { int count = tree_count(N, leafN, 0) + 1; size_t *ps = malloc(count * 2 * sizeof(size_t)); size_t *pps = ps; #ifdef __x86_64__ if(sign < 0) p->constants = sse_constants; else p->constants = sse_constants_inv; #endif elaborate_tree(&pps, N, leafN, 0); pps[0] = 0; pps[1] = 0; pps = ps; #ifdef __arm__ if(N < 8192) p->transform_size = 8192; else p->transform_size = N; #else if(N < 2048) p->transform_size = 16384; else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N); #endif #ifdef __APPLE__ p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0); #else #define MAP_ANONYMOUS 0x20 p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0); #endif /* if(p->transform_base == MAP_FAILED) { fprintf(stderr, "MAP FAILED\n"); exit(1); }*/ insns_t *func = p->transform_base;//valloc(8192); insns_t *fp = func; //fprintf(stderr, "Allocating %d bytes \n", p->transform_size); //fprintf(stderr, "Base address = %016p\n", func); if(!func) { fprintf(stderr, "NOMEM\n"); exit(1); } insns_t *x_8_addr = fp; #ifdef __arm__ #ifdef HAVE_NEON memcpy(fp, neon_x8, neon_x8_t - neon_x8); if(sign < 0) { fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; } fp += (neon_x8_t - neon_x8) / 4; #else memcpy(fp, vfp_x8, vfp_end - vfp_x8); if(sign > 0) { fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; fp[68] ^= 0x00000040; fp[70] ^= 0x00000040; fp[103] ^= 0x00000040; fp[104] ^= 0x00000040; fp[105] ^= 0x00000040; fp[108] ^= 0x00000040; fp[113] ^= 0x00000040; fp[114] ^= 0x00000040; fp[117] ^= 0x00000040; fp[118] ^= 0x00000040; } fp += (vfp_end - vfp_x8) / 4; #endif #else align_mem16(&fp, 0); x_8_addr = fp; align_mem16(&fp, 
5); memcpy(fp, x8_soft, x8_hard - x8_soft); fp += (x8_hard - x8_soft); //fprintf(stderr, "X8 start address = %016p\n", x_8_addr); #endif //uint32_t *x_8_t_addr = fp; //memcpy(fp, neon_x8_t, neon_end - neon_x8_t); //fp += (neon_end - neon_x8_t) / 4; insns_t *x_4_addr = fp; #ifdef __arm__ #ifdef HAVE_NEON memcpy(fp, neon_x4, neon_x8 - neon_x4); if(sign < 0) { fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; } fp += (neon_x8 - neon_x4) / 4; #else memcpy(fp, vfp_x4, vfp_x8 - vfp_x4); if(sign > 0) { fp[36] ^= 0x00000040; fp[38] ^= 0x00000040; fp[43] ^= 0x00000040; fp[44] ^= 0x00000040; } fp += (vfp_x8 - vfp_x4) / 4; #endif #else align_mem16(&fp, 0); x_4_addr = fp; memcpy(fp, x4, x8_soft - x4); fp += (x8_soft - x4); #endif insns_t *start = fp; #ifdef __arm__ *fp = PUSH_LR(); fp++; *fp = 0xed2d8b10; fp++; ADDI(&fp, 3, 1, 0); ADDI(&fp, 7, 1, N); ADDI(&fp, 5, 1, 2*N); ADDI(&fp, 10, 7, 2*N); ADDI(&fp, 4, 5, 2*N); ADDI(&fp, 8, 10, 2*N); ADDI(&fp, 6, 4, 2*N); ADDI(&fp, 9, 8, 2*N); *fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12 // *fp++ = LDRI(1, 0, 4); // load ws into r1 ADDI(&fp, 1, 0, 0); ADDI(&fp, 0, 2, 0), // mov out into r0 #endif #ifdef __arm__ *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; #ifdef HAVE_NEON MOVI(&fp, 11, p->i0); #else MOVI(&fp, 11, p->i0); #endif #else align_mem16(&fp, 0); start = fp; *fp++ = 0x4c; *fp++ = 0x8b; *fp++ = 0x07; uint32_t lp_cnt = p->i0 * 4; MOVI(&fp, RCX, lp_cnt); //LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p)); #endif //fp++; #ifdef __arm__ #ifdef HAVE_NEON memcpy(fp, neon_ee, neon_oo - neon_ee); if(sign < 0) { fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_oo - neon_ee) / 4; #else memcpy(fp, vfp_e, 
vfp_o - vfp_e); if(sign > 0) { fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; } fp += (vfp_o - vfp_e) / 4; #endif #else //fprintf(stderr, "Body start address = %016p\n", start); PUSH(&fp, RBP); PUSH(&fp, RBX); PUSH(&fp, R10); PUSH(&fp, R11); PUSH(&fp, R12); PUSH(&fp, R13); PUSH(&fp, R14); PUSH(&fp, R15); int i; memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init); //fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init); //fprintf(stderr, "Constants address = %016p\n", sse_constants); //fprintf(stderr, "Constants address = %016p\n", p->constants); //int32_t val = READ_IMM32(fp + 3); //fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p)); //int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp ); //fprintf(stderr, "IMM = 0x%llx\n", v2); //IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp )); fp += (leaf_ee - leaf_ee_init); //fprintf(stderr, "Leaf start address = %016p\n", fp); align_mem16(&fp, 9); memcpy(fp, leaf_ee, leaf_oo - leaf_ee); uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4}; uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4}; uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2}; for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4); fp += (leaf_oo - leaf_ee); if(__builtin_ctzl(N) & 1){ if(p->i1) { lp_cnt += p->i1 * 4; MOVI(&fp, RCX, lp_cnt); align_mem16(&fp, 4); memcpy(fp, leaf_oo, leaf_eo - leaf_oo); for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); fp += (leaf_eo - leaf_oo); } memcpy(fp, leaf_oe, leaf_end - leaf_oe); lp_cnt += 4; for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4); fp += (leaf_end - leaf_oe); }else{ memcpy(fp, leaf_eo, leaf_oe - leaf_eo); lp_cnt 
+= 4; for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4); fp += (leaf_oe - leaf_eo); if(p->i1) { lp_cnt += p->i1 * 4; MOVI(&fp, RCX, lp_cnt); align_mem16(&fp, 4); memcpy(fp, leaf_oo, leaf_eo - leaf_oo); for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4); fp += (leaf_eo - leaf_oo); } } if(p->i1) { lp_cnt += p->i1 * 4; MOVI(&fp, RCX, lp_cnt); align_mem16(&fp, 9); memcpy(fp, leaf_ee, leaf_oo - leaf_ee); for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4); fp += (leaf_oo - leaf_ee); } //fprintf(stderr, "Body start address = %016p\n", fp); //LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p)); memcpy(fp, x_init, x4 - x_init); //IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp )); fp += (x4 - x_init); int32_t pAddr = 0; int32_t pN = 0; int32_t pLUT = 0; count = 2; while(pps[0]) { if(!pN) { MOVI(&fp, RCX, pps[0] / 4); }else{ if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr); if(pps[0] > leafN && pps[0] - pN) { int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN); *fp++ = 0xc1; if(diff > 0) { *fp++ = 0xe1; *fp++ = (diff & 0xff); }else{ *fp++ = 0xe9; *fp++ = ((-diff) & 0xff); } } } if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); if(pps[0] == 2*leafN) { CALL(&fp, x_4_addr); // }else if(!pps[2]){ // //uint32_t *x_8_t_addr = fp; // memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); // fp += (neon_ee - neon_x8_t) / 4; // //*fp++ = BL(fp+2, x_8_t_addr); }else{ CALL(&fp, x_8_addr); } pAddr = pps[1] * 4; if(pps[0] > leafN) pN = pps[0]; pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); count += 4; pps += 2; } #endif #ifdef __arm__ #ifdef HAVE_NEON if(__builtin_ctzl(N) & 1){ ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 8, 0); ADDI(&fp, 8, 10, 0); ADDI(&fp, 10, 2, 0); if(p->i1) { MOVI(&fp, 11, p->i1); 
memcpy(fp, neon_oo, neon_eo - neon_oo); if(sign < 0) { fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_eo - neon_oo) / 4; } *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++; memcpy(fp, neon_oe, neon_end - neon_oe); if(sign < 0) { fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000; } fp += (neon_end - neon_oe) / 4; }else{ *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++; memcpy(fp, neon_eo, neon_oe - neon_eo); if(sign < 0) { fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000; fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000; } fp += (neon_oe - neon_eo) / 4; ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 8, 0); ADDI(&fp, 8, 10, 0); ADDI(&fp, 10, 2, 0); if(p->i1) { MOVI(&fp, 11, p->i1); memcpy(fp, neon_oo, neon_eo - neon_oo); if(sign < 0) { fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000; fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_eo - neon_oo) / 4; } } if(p->i1) { ADDI(&fp, 2, 3, 0); ADDI(&fp, 3, 7, 0); ADDI(&fp, 7, 2, 0); ADDI(&fp, 2, 4, 0); ADDI(&fp, 4, 8, 0); ADDI(&fp, 8, 2, 0); ADDI(&fp, 2, 5, 0); ADDI(&fp, 5, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 6, 0); ADDI(&fp, 6, 10, 0); ADDI(&fp, 10, 2, 0); ADDI(&fp, 2, 9, 0); ADDI(&fp, 9, 10, 
0); ADDI(&fp, 10, 2, 0); *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; MOVI(&fp, 11, p->i1); memcpy(fp, neon_ee, neon_oo - neon_ee); if(sign < 0) { fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000; fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000; } fp += (neon_oo - neon_ee) / 4; } #else ADDI(&fp, 2, 7, 0); ADDI(&fp, 7, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 8, 0); ADDI(&fp, 8, 10, 0); ADDI(&fp, 10, 2, 0); MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1); memcpy(fp, vfp_o, vfp_x4 - vfp_o); if(sign > 0) { fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040; fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040; } fp += (vfp_x4 - vfp_o) / 4; ADDI(&fp, 2, 3, 0); ADDI(&fp, 3, 7, 0); ADDI(&fp, 7, 2, 0); ADDI(&fp, 2, 4, 0); ADDI(&fp, 4, 8, 0); ADDI(&fp, 8, 2, 0); ADDI(&fp, 2, 5, 0); ADDI(&fp, 5, 9, 0); ADDI(&fp, 9, 2, 0); ADDI(&fp, 2, 6, 0); ADDI(&fp, 6, 10, 0); ADDI(&fp, 10, 2, 0); ADDI(&fp, 2, 9, 0); ADDI(&fp, 9, 10, 0); ADDI(&fp, 10, 2, 0); *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++; MOVI(&fp, 11, (p->i2>0) ? 
p->i2 : 1); memcpy(fp, vfp_e, vfp_o - vfp_e); if(sign > 0) { fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040; fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040; fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040; } fp += (vfp_o - vfp_e) / 4; #endif *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12 //ADDI(&fp, 2, 1, 0); MOVI(&fp, 1, 0); // args: r0 - out // r1 - N // r2 - ws // ADDI(&fp, 3, 1, 0); // put N into r3 for counter int32_t pAddr = 0; int32_t pN = 0; int32_t pLUT = 0; count = 2; while(pps[0]) { // fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr); if(!pN) { MOVI(&fp, 1, pps[0]); }else{ if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr); if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN); } if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT) ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT); if(pps[0] == 2*leafN) { *fp = BL(fp+2, x_4_addr); fp++; }else if(!pps[2]){ //uint32_t *x_8_t_addr = fp; #ifdef HAVE_NEON memcpy(fp, neon_x8_t, neon_ee - neon_x8_t); if(sign < 0) { fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000; fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000; } fp += (neon_ee - neon_x8_t) / 4; //*fp++ = BL(fp+2, x_8_t_addr); #else *fp = BL(fp+2, x_8_addr); fp++; #endif }else{ *fp = BL(fp+2, x_8_addr); fp++; } pAddr = pps[1] * 4; pN = pps[0]; pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN); // fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT); count += 4; pps += 2; } *fp++ = 0xecbd8b10; *fp++ = POP_LR(); count++; #else POP(&fp, R15); POP(&fp, R14); POP(&fp, R13); POP(&fp, R12); POP(&fp, R11); POP(&fp, R10); POP(&fp, RBX); POP(&fp, RBP); RET(&fp); 
//uint8_t *pp = func; //int counter = 0; //do{ // printf("%02x ", *pp); // if(counter++ % 16 == 15) printf("\n"); //} while(++pp < fp); //printf("\n"); #endif // *fp++ = B(14); count++; //for(int i=0;i<(neon_x8 - neon_x4)/4;i++) // fprintf(stderr, "%08x\n", x_4_addr[i]); //fprintf(stderr, "\n"); //for(int i=0;i<count;i++) free(ps); if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) { perror("Couldn't mprotect"); exit(1); } #ifdef __APPLE__ sys_icache_invalidate(func, p->transform_size); #elif __ANDROID__ cacheflush((long)(func), (long)(func) + p->transform_size, 0); #elif __linux__ #ifdef __GNUC__ __clear_cache((long)(func), (long)(func) + p->transform_size); #endif #endif //fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4); p->transform = (void *) (start); }
/*
 * Invalidate the instruction cache for the address range [start, end).
 *
 * start - first address of the range, given as a u32
 * end   - address one past the range, given as a u32
 * type  - not used by this implementation
 */
void clear_insn_cache(u32 start, u32 end, int type)
{
	void *const base = (void*)start;
	const u32 length = end - start;

	/* sys_icache_invalidate takes a base pointer and a byte count. */
	sys_icache_invalidate(base, length);
}