Пример #1
0
gpointer
mono_arch_get_static_rgctx_trampoline (gpointer arg, gpointer addr)
{
	guint8 *code, *start;
	int buf_len;
	GSList *unwind_ops;

	MonoDomain *domain = mono_domain_get ();

	buf_len = 10;

	start = code = mono_domain_code_reserve (domain, buf_len);

	unwind_ops = mono_arch_get_cie_program ();

	x86_mov_reg_imm (code, MONO_ARCH_RGCTX_REG, arg);
	x86_jump_code (code, addr);
	g_assert ((code - start) <= buf_len);

	mono_arch_flush_icache (start, code - start);
	MONO_PROFILER_RAISE (jit_code_buffer, (start, code - start, MONO_PROFILER_CODE_BUFFER_GENERICS_TRAMPOLINE, NULL));

	mono_tramp_info_register (mono_tramp_info_create (NULL, start, code - start, NULL, unwind_ops), domain);

	return start;
}
Пример #2
0
void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm )
{
   DUMP_RI( dst, imm );
   if(dst.mod == mod_REG)
      x86_mov_reg_imm(p, dst, imm);
   else
   {
      emit_1ub(p, 0xc7);
      emit_modrm_noreg(p, 0, dst);
      emit_1i(p, imm);
   }
}
Пример #3
0
/*
 * get_throw_trampoline:
 *
 *  Generate a call to mono_x86_throw_exception/
 * mono_x86_throw_corlib_exception.
 * If LLVM is true, generate code which assumes the caller is LLVM generated code, 
 * which doesn't push the arguments.
 */
static guint8*
get_throw_trampoline (const char *name, gboolean rethrow, gboolean llvm, gboolean corlib, gboolean llvm_abs, gboolean resume_unwind, MonoTrampInfo **info, gboolean aot)
{
	guint8 *start, *code, *labels [16];
	int i, stack_size, stack_offset, arg_offsets [5], regs_offset;
	MonoJumpInfo *ji = NULL;
	GSList *unwind_ops = NULL;
	guint kMaxCodeSize = 192;

	start = code = mono_global_codeman_reserve (kMaxCodeSize);

	stack_size = 128;

	/* 
	 * On apple, the stack is misaligned by the pushing of the return address.
	 */
	if (!llvm && corlib)
		/* On OSX, we don't generate alignment code to save space */
		stack_size += 4;
	else
		stack_size += MONO_ARCH_FRAME_ALIGNMENT - 4;

	/*
	 * The stack looks like this:
	 * <pc offset> (only if corlib is TRUE)
	 * <exception object>/<type token>
	 * <return addr> <- esp (unaligned on apple)
	 */

	unwind_ops = mono_arch_get_cie_program ();

	/* Alloc frame */
	x86_alu_reg_imm (code, X86_SUB, X86_ESP, stack_size);
	mono_add_unwind_op_def_cfa_offset (unwind_ops, code, start, stack_size + 4);

	arg_offsets [0] = 0;
	arg_offsets [1] = 4;
	arg_offsets [2] = 8;
	arg_offsets [3] = 12;
	regs_offset = 16;

	/* Save registers */
	for (i = 0; i < X86_NREG; ++i)
		if (i != X86_ESP)
			x86_mov_membase_reg (code, X86_ESP, regs_offset + (i * 4), i, 4);
	/* Calculate the offset between the current sp and the sp of the caller */
	if (llvm) {
		/* LLVM doesn't push the arguments */
		stack_offset = stack_size + 4;
	} else {
		if (corlib) {
			/* Two arguments */
			stack_offset = stack_size + 4 + 8;
#ifdef __APPLE__
			/* We don't generate stack alignment code on osx to save space */
#endif
		} else {
			/* One argument + stack alignment */
			stack_offset = stack_size + 4 + 4;
#ifdef __APPLE__
			/* Pop the alignment added by OP_THROW too */
			stack_offset += MONO_ARCH_FRAME_ALIGNMENT - 4;
#else
			if (mono_do_x86_stack_align)
				stack_offset += MONO_ARCH_FRAME_ALIGNMENT - 4;
#endif
		}
	}
	/* Save ESP */
	x86_lea_membase (code, X86_EAX, X86_ESP, stack_offset);
	x86_mov_membase_reg (code, X86_ESP, regs_offset + (X86_ESP * 4), X86_EAX, 4);

	/* Clear fp stack */
	labels [0] = code;
	x86_fnstsw (code);
	x86_shift_reg_imm (code, X86_SHR, X86_EAX, 11);
	x86_alu_reg_imm (code, X86_AND, X86_EAX, 7);
	x86_alu_reg_imm (code, X86_CMP, X86_EAX, 0);
	labels [1] = code;
	x86_branch8 (code, X86_CC_EQ, 0, FALSE);
	x86_fstp (code, 0);
	x86_jump_code (code, labels [0]);
	mono_x86_patch (labels [1], code);

	/* Set arg1 == regs */
	x86_lea_membase (code, X86_EAX, X86_ESP, regs_offset);
	x86_mov_membase_reg (code, X86_ESP, arg_offsets [0], X86_EAX, 4);
	/* Set arg2 == exc/ex_token_index */
	if (resume_unwind)
		x86_mov_reg_imm (code, X86_EAX, 0);
	else
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, stack_size + 4, 4);
	x86_mov_membase_reg (code, X86_ESP, arg_offsets [1], X86_EAX, 4);
	/* Set arg3 == eip */
	if (llvm_abs)
		x86_alu_reg_reg (code, X86_XOR, X86_EAX, X86_EAX);
	else
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, stack_size, 4);
	x86_mov_membase_reg (code, X86_ESP, arg_offsets [2], X86_EAX, 4);
	/* Set arg4 == rethrow/pc_offset */
	if (resume_unwind) {
		x86_mov_membase_imm (code, X86_ESP, arg_offsets [3], 0, 4);
	} else if (corlib) {
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, stack_size + 8, 4);
		if (llvm_abs) {
			/* 
			 * The caller is LLVM code which passes the absolute address not a pc offset,
			 * so compensate by passing 0 as 'ip' and passing the negated abs address as
			 * the pc offset.
			 */
			x86_neg_reg (code, X86_EAX);
		}
		x86_mov_membase_reg (code, X86_ESP, arg_offsets [3], X86_EAX, 4);
	} else {
		x86_mov_membase_imm (code, X86_ESP, arg_offsets [3], rethrow, 4);
	}
	/* Make the call */
	if (aot) {
		// This can be called from runtime code, which can't guarantee that
		// ebx contains the got address.
		// So emit the got address loading code too
		code = mono_arch_emit_load_got_addr (start, code, NULL, &ji);
		code = mono_arch_emit_load_aotconst (start, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, corlib ? "mono_x86_throw_corlib_exception" : "mono_x86_throw_exception");
		x86_call_reg (code, X86_EAX);
	} else {
		x86_call_code (code, resume_unwind ? (gpointer)(mono_x86_resume_unwind) : (corlib ? (gpointer)mono_x86_throw_corlib_exception : (gpointer)mono_x86_throw_exception));
	}
	x86_breakpoint (code);

	g_assert ((code - start) < kMaxCodeSize);

	if (info)
		*info = mono_tramp_info_create (name, start, code - start, ji, unwind_ops);
	else {
		GSList *l;

		for (l = unwind_ops; l; l = l->next)
			g_free (l->data);
		g_slist_free (unwind_ops);
	}

	mono_arch_flush_icache (start, code - start);
	mono_profiler_code_buffer_new (start, code - start, MONO_PROFILER_CODE_BUFFER_EXCEPTION_HANDLING, NULL);

	return start;
}
Пример #4
0
guchar*
mono_arch_create_generic_trampoline (MonoTrampolineType tramp_type, MonoTrampInfo **info, gboolean aot)
{
	const char *tramp_name;
	guint8 *buf, *code, *tramp, *br_ex_check;
	GSList *unwind_ops = NULL;
	MonoJumpInfo *ji = NULL;
	int i, offset, frame_size, regarray_offset, lmf_offset, caller_ip_offset, arg_offset;
	int cfa_offset; /* cfa = cfa_reg + cfa_offset */

	code = buf = mono_global_codeman_reserve (256);

	/* Note that there is a single argument to the trampoline
	 * and it is stored at: esp + pushed_args * sizeof (target_mgreg_t)
	 * the ret address is at: esp + (pushed_args + 1) * sizeof (target_mgreg_t)
	 */

	/* Compute frame offsets relative to the frame pointer %ebp */
	arg_offset = sizeof (target_mgreg_t);
	caller_ip_offset = 2 * sizeof (target_mgreg_t);
	offset = 0;
	offset += sizeof (MonoLMF);
	lmf_offset = -offset;
	offset += X86_NREG * sizeof (target_mgreg_t);
	regarray_offset = -offset;
	/* Argument area */
	offset += 4 * sizeof (target_mgreg_t);
	frame_size = ALIGN_TO (offset, MONO_ARCH_FRAME_ALIGNMENT);

	/* ret addr and arg are on the stack */
	cfa_offset = 2 * sizeof (target_mgreg_t);
	mono_add_unwind_op_def_cfa (unwind_ops, code, buf, X86_ESP, cfa_offset);
	// IP saved at CFA - 4
	mono_add_unwind_op_offset (unwind_ops, code, buf, X86_NREG, -4);

	/* Allocate frame */
	x86_push_reg (code, X86_EBP);
	cfa_offset += sizeof (target_mgreg_t);
	mono_add_unwind_op_def_cfa_offset (unwind_ops, code, buf, cfa_offset);
	mono_add_unwind_op_offset (unwind_ops, code, buf, X86_EBP, -cfa_offset);

	x86_mov_reg_reg (code, X86_EBP, X86_ESP);
	mono_add_unwind_op_def_cfa_reg (unwind_ops, code, buf, X86_EBP);

	/* There are three words on the stack, adding + 4 aligns the stack to 16, which is needed on osx */
	x86_alu_reg_imm (code, X86_SUB, X86_ESP, frame_size + sizeof (target_mgreg_t));

	/* Save all registers */
	for (i = X86_EAX; i <= X86_EDI; ++i) {
		int reg = i;

		if (i == X86_EBP) {
			/* Save original ebp */
			/* EAX is already saved */
			x86_mov_reg_membase (code, X86_EAX, X86_EBP, 0, sizeof (target_mgreg_t));
			reg = X86_EAX;
		} else if (i == X86_ESP) {
			/* Save original esp */
			/* EAX is already saved */
			x86_mov_reg_reg (code, X86_EAX, X86_EBP);
			/* Saved ebp + trampoline arg + return addr */
			x86_alu_reg_imm (code, X86_ADD, X86_EAX, 3 * sizeof (target_mgreg_t));
			reg = X86_EAX;
		}
		x86_mov_membase_reg (code, X86_EBP, regarray_offset + (i * sizeof (target_mgreg_t)), reg, sizeof (target_mgreg_t));
	}

	/* Setup LMF */
	/* eip */
	if (tramp_type == MONO_TRAMPOLINE_JUMP) {
		x86_mov_membase_imm (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, eip), 0, sizeof (target_mgreg_t));
	} else {
		x86_mov_reg_membase (code, X86_EAX, X86_EBP, caller_ip_offset, sizeof (target_mgreg_t));
		x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, eip), X86_EAX, sizeof (target_mgreg_t));
	}
	/* method */
	if ((tramp_type == MONO_TRAMPOLINE_JIT) || (tramp_type == MONO_TRAMPOLINE_JUMP)) {
		x86_mov_reg_membase (code, X86_EAX, X86_EBP, arg_offset, sizeof (target_mgreg_t));
		x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), X86_EAX, sizeof (target_mgreg_t));
	} else {
		x86_mov_membase_imm (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, method), 0, sizeof (target_mgreg_t));
	}
	/* esp */
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, regarray_offset + (X86_ESP * sizeof (target_mgreg_t)), sizeof (target_mgreg_t));
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, esp), X86_EAX, sizeof (target_mgreg_t));
	/* callee save registers */
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, regarray_offset + (X86_EBX * sizeof (target_mgreg_t)), sizeof (target_mgreg_t));
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebx), X86_EAX, sizeof (target_mgreg_t));
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, regarray_offset + (X86_EDI * sizeof (target_mgreg_t)), sizeof (target_mgreg_t));
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, edi), X86_EAX, sizeof (target_mgreg_t));
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, regarray_offset + (X86_ESI * sizeof (target_mgreg_t)), sizeof (target_mgreg_t));
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, esi), X86_EAX, sizeof (target_mgreg_t));
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, regarray_offset + (X86_EBP * sizeof (target_mgreg_t)), sizeof (target_mgreg_t));
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, ebp), X86_EAX, sizeof (target_mgreg_t));

	/* Push LMF */
	/* get the address of lmf for the current thread */
	if (aot) {
		code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, "mono_get_lmf_addr");
		x86_call_reg (code, X86_EAX);
	} else {
		x86_call_code (code, mono_get_lmf_addr);
	}
	/* lmf->lmf_addr = lmf_addr (%eax) */
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), X86_EAX, sizeof (target_mgreg_t));
	/* lmf->previous_lmf = *(lmf_addr) */
	x86_mov_reg_membase (code, X86_ECX, X86_EAX, 0, sizeof (target_mgreg_t));
	/* Signal to mono_arch_unwind_frame () that this is a trampoline frame */
	x86_alu_reg_imm (code, X86_ADD, X86_ECX, 1);
	x86_mov_membase_reg (code, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), X86_ECX, sizeof (target_mgreg_t));
	/* *lmf_addr = lmf */
	x86_lea_membase (code, X86_ECX, X86_EBP, lmf_offset);
	x86_mov_membase_reg (code, X86_EAX, 0, X86_ECX, sizeof (target_mgreg_t));

	/* Call trampoline function */
	/* Arg 1 - registers */
	x86_lea_membase (code, X86_EAX, X86_EBP, regarray_offset);
	x86_mov_membase_reg (code, X86_ESP, (0 * sizeof (target_mgreg_t)), X86_EAX, sizeof (target_mgreg_t));
	/* Arg2 - calling code */
	if (tramp_type == MONO_TRAMPOLINE_JUMP) {
		x86_mov_membase_imm (code, X86_ESP, (1 * sizeof (target_mgreg_t)), 0, sizeof (target_mgreg_t));
	} else {
		x86_mov_reg_membase (code, X86_EAX, X86_EBP, caller_ip_offset, sizeof (target_mgreg_t));
		x86_mov_membase_reg (code, X86_ESP, (1 * sizeof (target_mgreg_t)), X86_EAX, sizeof (target_mgreg_t));
	}
	/* Arg3 - trampoline argument */
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, arg_offset, sizeof (target_mgreg_t));
	x86_mov_membase_reg (code, X86_ESP, (2 * sizeof (target_mgreg_t)), X86_EAX, sizeof (target_mgreg_t));
	/* Arg4 - trampoline address */
	// FIXME:
	x86_mov_membase_imm (code, X86_ESP, (3 * sizeof (target_mgreg_t)), 0, sizeof (target_mgreg_t));

#ifdef __APPLE__
	/* check the stack is aligned after the ret ip is pushed */
	/*
	x86_mov_reg_reg (code, X86_EDX, X86_ESP);
	x86_alu_reg_imm (code, X86_AND, X86_EDX, 15);
	x86_alu_reg_imm (code, X86_CMP, X86_EDX, 0);
	x86_branch_disp (code, X86_CC_Z, 3, FALSE);
	x86_breakpoint (code);
	*/
#endif

	if (aot) {
		code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_TRAMPOLINE_FUNC_ADDR, GINT_TO_POINTER (tramp_type));
		x86_call_reg (code, X86_EAX);
	} else {
		tramp = (guint8*)mono_get_trampoline_func (tramp_type);
		x86_call_code (code, tramp);
	}

	/*
	 * Overwrite the trampoline argument with the address we need to jump to,
	 * to free %eax.
	 */
	x86_mov_membase_reg (code, X86_EBP, arg_offset, X86_EAX, 4);

	/* Restore LMF */
	x86_mov_reg_membase (code, X86_EAX, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, lmf_addr), sizeof (target_mgreg_t));
	x86_mov_reg_membase (code, X86_ECX, X86_EBP, lmf_offset + G_STRUCT_OFFSET (MonoLMF, previous_lmf), sizeof (target_mgreg_t));
	x86_alu_reg_imm (code, X86_SUB, X86_ECX, 1);
	x86_mov_membase_reg (code, X86_EAX, 0, X86_ECX, sizeof (target_mgreg_t));

	/* Check for interruptions */
	if (aot) {
		code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, "mono_thread_force_interruption_checkpoint_noraise");
		x86_call_reg (code, X86_EAX);
	} else {
		x86_call_code (code, (guint8*)mono_thread_force_interruption_checkpoint_noraise);
	}

	x86_test_reg_reg (code, X86_EAX, X86_EAX);
	br_ex_check = code;
	x86_branch8 (code, X86_CC_Z, -1, 1);

	/*
	 * Exception case:
	 * We have an exception we want to throw in the caller's frame, so pop
	 * the trampoline frame and throw from the caller.
	 */
	x86_leave (code);
	/*
	 * The exception is in eax.
	 * We are calling the throw trampoline used by OP_THROW, so we have to setup the
	 * stack to look the same.
	 * The stack contains the ret addr, and the trampoline argument, the throw trampoline
	 * expects it to contain the ret addr and the exception. It also needs to be aligned
	 * after the exception is pushed.
	 */
	/* Align stack */
	x86_push_reg (code, X86_EAX);
	/* Push the exception */
	x86_push_reg (code, X86_EAX);
	//x86_breakpoint (code);
	/* Push the original return value */
	x86_push_membase (code, X86_ESP, 3 * 4);
	/*
	 * EH is initialized after trampolines, so get the address of the variable
	 * which contains throw_exception, and load it from there.
	 */
	if (aot) {
		/* Not really a jit icall */
		code = mono_arch_emit_load_aotconst (buf, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, "rethrow_preserve_exception_addr");
	} else {
		x86_mov_reg_imm (code, X86_ECX, (guint8*)mono_get_rethrow_preserve_exception_addr ());
	}
	x86_mov_reg_membase (code, X86_ECX, X86_ECX, 0, sizeof (target_mgreg_t));
	x86_jump_reg (code, X86_ECX);

	/* Normal case */
	mono_x86_patch (br_ex_check, code);

	/* Restore registers */
	for (i = X86_EAX; i <= X86_EDI; ++i) {
		if (i == X86_ESP || i == X86_EBP)
			continue;
		if (i == X86_EAX && tramp_type != MONO_TRAMPOLINE_AOT_PLT)
			continue;
		x86_mov_reg_membase (code, i, X86_EBP, regarray_offset + (i * 4), 4);
	}

	/* Restore frame */
	x86_leave (code);
	cfa_offset -= sizeof (target_mgreg_t);
	mono_add_unwind_op_def_cfa (unwind_ops, code, buf, X86_ESP, cfa_offset);
	mono_add_unwind_op_same_value (unwind_ops, code, buf, X86_EBP);

	if (MONO_TRAMPOLINE_TYPE_MUST_RETURN (tramp_type)) {
		/* Load the value returned by the trampoline */
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, 0, 4);
		/* The trampoline returns normally, pop the trampoline argument */
		x86_alu_reg_imm (code, X86_ADD, X86_ESP, 4);
		cfa_offset -= sizeof (target_mgreg_t);
		mono_add_unwind_op_def_cfa_offset (unwind_ops, code, buf, cfa_offset);
		x86_ret (code);
	} else {
		x86_ret (code);
	}

	g_assert ((code - buf) <= 256);
	MONO_PROFILER_RAISE (jit_code_buffer, (buf, code - buf, MONO_PROFILER_CODE_BUFFER_HELPER, NULL));

	tramp_name = mono_get_generic_trampoline_name (tramp_type);
	*info = mono_tramp_info_create (tramp_name, buf, code - buf, ji, unwind_ops);

	return buf;
}
Пример #5
0
void
_jit_gen_load_value(jit_gencode_t gen, int reg, int other_reg, jit_value_t value)
{
	jit_type_t type;
	int src_reg, other_src_reg;
	void *ptr;
	int offset;

	/* Make sure that we have sufficient space */
	jit_cache_setup_output(16);

	type = jit_type_normalize(value->type);

	/* Load zero */
	if(value->is_constant)
	{
		switch(type->kind)
		{
			case JIT_TYPE_SBYTE:
			case JIT_TYPE_UBYTE:
			case JIT_TYPE_SHORT:
			case JIT_TYPE_USHORT:
			case JIT_TYPE_INT:
			case JIT_TYPE_UINT:
			{
				if((jit_nint)(value->address) == 0)
				{
					x86_clear_reg(inst, _jit_reg_info[reg].cpu_reg);
				}
				else
				{
					x86_mov_reg_imm(inst, _jit_reg_info[reg].cpu_reg,
							(jit_nint)(value->address));
				}
			}
			break;

			case JIT_TYPE_LONG:
			case JIT_TYPE_ULONG:
			{
				jit_long long_value;
				long_value = jit_value_get_long_constant(value);
				if(long_value == 0)
				{
#ifdef JIT_NATIVE_INT64
					x86_clear_reg(inst, _jit_reg_info[reg].cpu_reg);
#else
					x86_clear_reg(inst, _jit_reg_info[reg].cpu_reg);
					x86_clear_reg(inst, _jit_reg_info[other_reg].cpu_reg);
#endif
				}
				else
				{
#ifdef JIT_NATIVE_INT64
					x86_mov_reg_imm(inst, _jit_reg_info[reg].cpu_reg,
							(jit_nint)long_value);
#else
					x86_mov_reg_imm(inst, _jit_reg_info[reg].cpu_reg,
							(jit_int)long_value);
					x86_mov_reg_imm(inst, _jit_reg_info[other_reg].cpu_reg,
							(jit_int)(long_value >> 32));
#endif
				}
			}
			break;

			case JIT_TYPE_FLOAT32:
			{
				jit_float32 float32_value;
				float32_value = jit_value_get_float32_constant(value);
				if(IS_WORD_REG(reg))
				{
					union
					{
						jit_float32 float32_value;
						jit_int int_value;
					} un;

					un.float32_value = float32_value;
					x86_mov_reg_imm(inst, _jit_reg_info[reg].cpu_reg, un.int_value);
				}
				else
				{
					if(float32_value == (jit_float32) 0.0)
					{
						x86_fldz(inst);
					}
					else if(float32_value == (jit_float32) 1.0)
					{
						x86_fld1(inst);
					}
					else
					{
						ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float32));
						jit_memcpy(ptr, &float32_value, sizeof(float32_value));
						x86_fld(inst, ptr, 0);
					}
				}
			}
			break;

			case JIT_TYPE_FLOAT64:
			{
				jit_float64 float64_value;
				float64_value = jit_value_get_float64_constant(value);
				if(IS_WORD_REG(reg))
				{
					union
					{
						jit_float64 float64_value;
						jit_int int_value[2];
					} un;

					un.float64_value = float64_value;
					x86_mov_reg_imm(inst, _jit_reg_info[reg].cpu_reg,
							un.int_value[0]);
					x86_mov_reg_imm(inst, _jit_reg_info[other_reg].cpu_reg,
							un.int_value[1]);
				}
				else
				{
					if(float64_value == (jit_float64) 0.0)
					{
						x86_fldz(inst);
					}
					else if(float64_value == (jit_float64) 1.0)
					{
						x86_fld1(inst);
					}
					else
					{
						ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_float64));
						jit_memcpy(ptr, &float64_value, sizeof(float64_value));
						x86_fld(inst, ptr, 1);
					}
				}
			}
			break;

			case JIT_TYPE_NFLOAT:
			{
				jit_nfloat nfloat_value;
				nfloat_value = jit_value_get_nfloat_constant(value);
				if(IS_WORD_REG(reg) && sizeof(jit_nfloat) == sizeof(jit_float64))
				{
					union
					{
						jit_nfloat nfloat_value;
						jit_int int_value[2];
					} un;

					un.nfloat_value = nfloat_value;
					x86_mov_reg_imm(inst, _jit_reg_info[reg].cpu_reg,
							un.int_value[0]);
					x86_mov_reg_imm(inst, _jit_reg_info[other_reg].cpu_reg,
							un.int_value[1]);
				}
				else
				{
					if(nfloat_value == (jit_nfloat) 0.0)
					{
						x86_fldz(inst);
					}
					else if(nfloat_value == (jit_nfloat) 1.0)
					{
						x86_fld1(inst);
					}
					else
					{
						ptr = _jit_cache_alloc(&(gen->posn), sizeof(jit_nfloat));
						jit_memcpy(ptr, &nfloat_value, sizeof(nfloat_value));
						if(sizeof(jit_nfloat) == sizeof(jit_float64))
						{
							x86_fld(inst, ptr, 1);
						}
						else
						{
							x86_fld80_mem(inst, ptr);
						}
					}
				}
			}
			break;
		}
	}
	else if(value->in_register || value->in_global_register)
Пример #6
0
/*
 * get_throw_trampoline:
 *
 *  Generate a call to mono_x86_throw_exception/
 * mono_x86_throw_corlib_exception.
 * If LLVM is true, generate code which assumes the caller is LLVM generated code, 
 * which doesn't push the arguments.
 */
static guint8*
get_throw_trampoline (const char *name, gboolean rethrow, gboolean llvm, gboolean corlib, gboolean llvm_abs, gboolean resume_unwind, MonoTrampInfo **info, gboolean aot)
{
	guint8 *start, *code;
	int i, stack_size, stack_offset, arg_offsets [5], regs_offset;
	MonoJumpInfo *ji = NULL;
	GSList *unwind_ops = NULL;

	start = code = mono_global_codeman_reserve (128);

	stack_size = 128;

	/* 
	 * On apple, the stack is misaligned by the pushing of the return address.
	 */
	if (!llvm && corlib)
		/* On OSX, we don't generate alignment code to save space */
		stack_size += 4;
	else
		stack_size += MONO_ARCH_FRAME_ALIGNMENT - 4;

	/*
	 * The stack looks like this:
	 * <pc offset> (only if corlib is TRUE)
	 * <exception object>/<type token>
	 * <return addr> <- esp (unaligned on apple)
	 */

	mono_add_unwind_op_def_cfa (unwind_ops, (guint8*)NULL, (guint8*)NULL, X86_ESP, 4);
	mono_add_unwind_op_offset (unwind_ops, (guint8*)NULL, (guint8*)NULL, X86_NREG, -4);

	/* Alloc frame */
	x86_alu_reg_imm (code, X86_SUB, X86_ESP, stack_size);
	mono_add_unwind_op_def_cfa_offset (unwind_ops, code, start, stack_size + 4);

	arg_offsets [0] = 0;
	arg_offsets [1] = 4;
	arg_offsets [2] = 8;
	arg_offsets [3] = 12;
	regs_offset = 16;

	/* Save registers */
	for (i = 0; i < X86_NREG; ++i)
		if (i != X86_ESP)
			x86_mov_membase_reg (code, X86_ESP, regs_offset + (i * 4), i, 4);
	/* Calculate the offset between the current sp and the sp of the caller */
	if (llvm) {
		/* LLVM doesn't push the arguments */
		stack_offset = stack_size + 4;
	} else {
		if (corlib) {
			/* Two arguments */
			stack_offset = stack_size + 4 + 8;
#ifdef __APPLE__
			/* We don't generate stack alignment code on osx to save space */
#endif
		} else {
			/* One argument */
			stack_offset = stack_size + 4 + 4;
#ifdef __APPLE__
			/* Pop the alignment added by OP_THROW too */
			stack_offset += MONO_ARCH_FRAME_ALIGNMENT - 4;
#endif
		}
	}
	/* Save ESP */
	x86_lea_membase (code, X86_EAX, X86_ESP, stack_offset);
	x86_mov_membase_reg (code, X86_ESP, regs_offset + (X86_ESP * 4), X86_EAX, 4);

	/* Set arg1 == regs */
	x86_lea_membase (code, X86_EAX, X86_ESP, regs_offset);
	x86_mov_membase_reg (code, X86_ESP, arg_offsets [0], X86_EAX, 4);
	/* Set arg2 == exc/ex_token_index */
	if (resume_unwind)
		x86_mov_reg_imm (code, X86_EAX, 0);
	else
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, stack_size + 4, 4);
	x86_mov_membase_reg (code, X86_ESP, arg_offsets [1], X86_EAX, 4);
	/* Set arg3 == eip */
	if (llvm_abs)
		x86_alu_reg_reg (code, X86_XOR, X86_EAX, X86_EAX);
	else
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, stack_size, 4);
	x86_mov_membase_reg (code, X86_ESP, arg_offsets [2], X86_EAX, 4);
	/* Set arg4 == rethrow/pc_offset */
	if (resume_unwind) {
		x86_mov_membase_imm (code, X86_ESP, arg_offsets [3], 0, 4);
	} else if (corlib) {
		x86_mov_reg_membase (code, X86_EAX, X86_ESP, stack_size + 8, 4);
		if (llvm_abs) {
			/* 
			 * The caller is LLVM code which passes the absolute address not a pc offset,
			 * so compensate by passing 0 as 'ip' and passing the negated abs address as
			 * the pc offset.
			 */
			x86_neg_reg (code, X86_EAX);
		}
		x86_mov_membase_reg (code, X86_ESP, arg_offsets [3], X86_EAX, 4);
	} else {
		x86_mov_membase_imm (code, X86_ESP, arg_offsets [3], rethrow, 4);
	}
	/* Make the call */
	if (aot) {
		// This can be called from runtime code, which can't guarantee that
		// ebx contains the got address.
		// So emit the got address loading code too
		code = mono_arch_emit_load_got_addr (start, code, NULL, &ji);
		code = mono_arch_emit_load_aotconst (start, code, &ji, MONO_PATCH_INFO_JIT_ICALL_ADDR, corlib ? "mono_x86_throw_corlib_exception" : "mono_x86_throw_exception");
		x86_call_reg (code, X86_EAX);
	} else {
		x86_call_code (code, resume_unwind ? (mono_x86_resume_unwind) : (corlib ? (gpointer)mono_x86_throw_corlib_exception : (gpointer)mono_x86_throw_exception));
	}
	x86_breakpoint (code);

	g_assert ((code - start) < 128);

	if (info)
		*info = mono_tramp_info_create (g_strdup (name), start, code - start, ji, unwind_ops);

	return start;
}
Пример #7
0
transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
{
    uint32_t offsets[8] = {0, 4*N, 2*N, 6*N, N, 5*N, 7*N, 3*N};
    uint32_t offsets_o[8] = {0, 4*N, 2*N, 6*N, 7*N, 3*N, N, 5*N};

    int32_t pAddr = 0;
    int32_t pN = 0;
    int32_t pLUT = 0;

    insns_t  *fp;
    insns_t  *start;
    insns_t  *x_4_addr;
    insns_t  *x_8_addr;
    uint32_t  loop_count;

    int       count;
    ptrdiff_t len;

    size_t   *ps;
    size_t   *pps;

    count = ffts_tree_count(N, leaf_N, 0) + 1;

    ps = pps = malloc(2 * count * sizeof(*ps));
    if (!ps) {
        return NULL;
    }

    ffts_elaborate_tree(&pps, N, leaf_N, 0);

    pps[0] = 0;
    pps[1] = 0;

    pps = ps;

#ifdef HAVE_SSE
    if (sign < 0) {
        p->constants = (const void*) sse_constants;
    } else {
        p->constants = (const void*) sse_constants_inv;
    }
#endif

    fp = (insns_t*) p->transform_base;

    /* generate base cases */
    x_4_addr = generate_size4_base_case(&fp, sign);
    x_8_addr = generate_size8_base_case(&fp, sign);

#ifdef __arm__
    start = generate_prologue(&fp, p);

#ifdef HAVE_NEON
    memcpy(fp, neon_ee, neon_oo - neon_ee);
    if (sign < 0) {
        fp[33] ^= 0x00200000;
        fp[37] ^= 0x00200000;
        fp[38] ^= 0x00200000;
        fp[39] ^= 0x00200000;
        fp[40] ^= 0x00200000;
        fp[41] ^= 0x00200000;
        fp[44] ^= 0x00200000;
        fp[45] ^= 0x00200000;
        fp[46] ^= 0x00200000;
        fp[47] ^= 0x00200000;
        fp[48] ^= 0x00200000;
        fp[57] ^= 0x00200000;
    }

    fp += (neon_oo - neon_ee) / 4;
#else
    memcpy(fp, vfp_e, vfp_o - vfp_e);

    if (sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;
#endif
#else
    /* generate functions */
    start = generate_prologue(&fp, p);

    loop_count = 4 * p->i0;
    generate_leaf_init(&fp, loop_count);

    if (ffts_ctzl(N) & 1) {
        generate_leaf_ee(&fp, offsets, p->i1 ? 6 : 0);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, 7);
        }

        loop_count += 4;
        generate_leaf_oe(&fp, offsets_o);
    } else {
        generate_leaf_ee(&fp, offsets, N >= 256 ? 2 : 8);

        loop_count += 4;
        generate_leaf_eo(&fp, offsets);

        if (p->i1) {
            loop_count += 4 * p->i1;
            generate_leaf_oo(&fp, loop_count, offsets_o, N >= 256 ? 4 : 7);
        }
    }

    if (p->i1) {
        uint32_t offsets_oe[8] = {7*N, 3*N, N, 5*N, 0, 4*N, 6*N, 2*N};

        loop_count += 4 * p->i1;

        /* align loop/jump destination */
#ifdef _M_X64
        x86_mov_reg_imm(fp, X86_EBX, loop_count);
#else
        x86_mov_reg_imm(fp, X86_ECX, loop_count);
        ffts_align_mem16(&fp, 9);
#endif

        generate_leaf_ee(&fp, offsets_oe, 0);
    }

    generate_transform_init(&fp);

    /* generate subtransform calls */
    count = 2;
    while (pps[0]) {
        size_t ws_is;

        if (!pN) {
#ifdef _M_X64
            x86_mov_reg_imm(fp, X86_EBX, pps[0]);
#else
            x86_mov_reg_imm(fp, X86_ECX, pps[0] / 4);
#endif
        } else {
            int offset = (4 * pps[1]) - pAddr;
            if (offset) {
#ifdef _M_X64
                x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#else
                x64_alu_reg_imm_size(fp, X86_ADD, X64_RDX, offset, 8);
#endif
            }

            if (pps[0] > leaf_N && pps[0] - pN) {
                int factor = ffts_ctzl(pps[0]) - ffts_ctzl(pN);

#ifdef _M_X64
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_EBX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_EBX, -factor);
                }
#else
                if (factor > 0) {
                    x86_shift_reg_imm(fp, X86_SHL, X86_ECX, factor);
                } else {
                    x86_shift_reg_imm(fp, X86_SHR, X86_ECX, -factor);
                }
#endif
            }
        }

        ws_is = 8 * p->ws_is[ffts_ctzl(pps[0] / leaf_N) - 1];
        if (ws_is != pLUT) {
            int offset = (int) (ws_is - pLUT);

#ifdef _M_X64
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R9, offset, 8);
#else
            x64_alu_reg_imm_size(fp, X86_ADD, X64_R8, offset, 8);
#endif
        }

        if (pps[0] == 2 * leaf_N) {
            x64_call_code(fp, x_4_addr);
        } else {
            x64_call_code(fp, x_8_addr);
        }

        pAddr = 4 * pps[1];
        if (pps[0] > leaf_N) {
            pN = pps[0];
        }

        pLUT = ws_is;//LUT_offset(pps[0], leafN);
        //fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }
#endif

#ifdef __arm__
#ifdef HAVE_NEON
    if (ffts_ctzl(N) & 1) {
        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }

        *fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_oe, neon_end - neon_oe);
        if(sign < 0) {
            fp[19] ^= 0x00200000;
            fp[20] ^= 0x00200000;
            fp[22] ^= 0x00200000;
            fp[23] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[64] ^= 0x00200000;
            fp[65] ^= 0x00200000;
            fp[66] ^= 0x00200000;
            fp[67] ^= 0x00200000;
        }
        fp += (neon_end - neon_oe) / 4;

    } else {

        *fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p));
        fp++;

        memcpy(fp, neon_eo, neon_oe - neon_eo);
        if(sign < 0) {
            fp[10] ^= 0x00200000;
            fp[11] ^= 0x00200000;
            fp[13] ^= 0x00200000;
            fp[14] ^= 0x00200000;
            fp[31] ^= 0x00200000;
            fp[33] ^= 0x00200000;
            fp[34] ^= 0x00200000;
            fp[35] ^= 0x00200000;
            fp[59] ^= 0x00200000;
            fp[60] ^= 0x00200000;
            fp[61] ^= 0x00200000;
            fp[62] ^= 0x00200000;
        }
        fp += (neon_oe - neon_eo) / 4;

        ADDI(&fp, 2, 7, 0);
        ADDI(&fp, 7, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 8, 0);
        ADDI(&fp, 8, 10, 0);
        ADDI(&fp, 10, 2, 0);

        if(p->i1) {
            MOVI(&fp, 11, p->i1);
            memcpy(fp, neon_oo, neon_eo - neon_oo);
            if(sign < 0) {
                fp[12] ^= 0x00200000;
                fp[13] ^= 0x00200000;
                fp[14] ^= 0x00200000;
                fp[15] ^= 0x00200000;
                fp[27] ^= 0x00200000;
                fp[29] ^= 0x00200000;
                fp[30] ^= 0x00200000;
                fp[31] ^= 0x00200000;
                fp[46] ^= 0x00200000;
                fp[47] ^= 0x00200000;
                fp[48] ^= 0x00200000;
                fp[57] ^= 0x00200000;
            }
            fp += (neon_eo - neon_oo) / 4;
        }
    }

    if(p->i1) {
        ADDI(&fp, 2, 3, 0);
        ADDI(&fp, 3, 7, 0);
        ADDI(&fp, 7, 2, 0);

        ADDI(&fp, 2, 4, 0);
        ADDI(&fp, 4, 8, 0);
        ADDI(&fp, 8, 2, 0);

        ADDI(&fp, 2, 5, 0);
        ADDI(&fp, 5, 9, 0);
        ADDI(&fp, 9, 2, 0);

        ADDI(&fp, 2, 6, 0);
        ADDI(&fp, 6, 10, 0);
        ADDI(&fp, 10, 2, 0);

        ADDI(&fp, 2, 9, 0);
        ADDI(&fp, 9, 10, 0);
        ADDI(&fp, 10, 2, 0);

        *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
        fp++;
        MOVI(&fp, 11, p->i1);
        memcpy(fp, neon_ee, neon_oo - neon_ee);
        if(sign < 0) {
            fp[33] ^= 0x00200000;
            fp[37] ^= 0x00200000;
            fp[38] ^= 0x00200000;
            fp[39] ^= 0x00200000;
            fp[40] ^= 0x00200000;
            fp[41] ^= 0x00200000;
            fp[44] ^= 0x00200000;
            fp[45] ^= 0x00200000;
            fp[46] ^= 0x00200000;
            fp[47] ^= 0x00200000;
            fp[48] ^= 0x00200000;
            fp[57] ^= 0x00200000;
        }
        fp += (neon_oo - neon_ee) / 4;
    }
#else
    ADDI(&fp, 2, 7, 0);
    ADDI(&fp, 7, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 8, 0);
    ADDI(&fp, 8, 10, 0);
    ADDI(&fp, 10, 2, 0);

    MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
    memcpy(fp, vfp_o, vfp_x4 - vfp_o);
    if(sign > 0) {
        fp[22] ^= 0x00000040;
        fp[24] ^= 0x00000040;
        fp[25] ^= 0x00000040;
        fp[26] ^= 0x00000040;
        fp[62] ^= 0x00000040;
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[66] ^= 0x00000040;
    }
    fp += (vfp_x4 - vfp_o) / 4;

    ADDI(&fp, 2, 3, 0);
    ADDI(&fp, 3, 7, 0);
    ADDI(&fp, 7, 2, 0);

    ADDI(&fp, 2, 4, 0);
    ADDI(&fp, 4, 8, 0);
    ADDI(&fp, 8, 2, 0);

    ADDI(&fp, 2, 5, 0);
    ADDI(&fp, 5, 9, 0);
    ADDI(&fp, 9, 2, 0);

    ADDI(&fp, 2, 6, 0);
    ADDI(&fp, 6, 10, 0);
    ADDI(&fp, 10, 2, 0);

    ADDI(&fp, 2, 9, 0);
    ADDI(&fp, 9, 10, 0);
    ADDI(&fp, 10, 2, 0);

    *fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p));
    fp++;
    MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
    memcpy(fp, vfp_e, vfp_o - vfp_e);
    if(sign > 0) {
        fp[64] ^= 0x00000040;
        fp[65] ^= 0x00000040;
        fp[68] ^= 0x00000040;
        fp[75] ^= 0x00000040;
        fp[76] ^= 0x00000040;
        fp[79] ^= 0x00000040;
        fp[80] ^= 0x00000040;
        fp[83] ^= 0x00000040;
        fp[84] ^= 0x00000040;
        fp[87] ^= 0x00000040;
        fp[91] ^= 0x00000040;
        fp[93] ^= 0x00000040;
    }
    fp += (vfp_o - vfp_e) / 4;

#endif
    *fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p));
    fp++; // load offsets into r12
    //ADDI(&fp, 2, 1, 0);
    MOVI(&fp, 1, 0);

    // args: r0 - out
    //       r1 - N
    //       r2 - ws
    //	ADDI(&fp, 3, 1, 0); // put N into r3 for counter

    count = 2;
    while(pps[0]) {

        //	fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
        if(!pN) {
            MOVI(&fp, 1, pps[0]);
        } else {
            if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
            if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
        }

        if (p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT) {
            ADDI(&fp, 2, 2, p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8 - pLUT);
        }

        if(pps[0] == 2 * leaf_N) {
            *fp = BL(fp+2, x_4_addr);
            fp++;
        } else if(!pps[2]) {
            //uint32_t *x_8_t_addr = fp;
#ifdef HAVE_NEON
            memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
            if(sign < 0) {
                fp[31] ^= 0x00200000;
                fp[32] ^= 0x00200000;
                fp[33] ^= 0x00200000;
                fp[34] ^= 0x00200000;
                fp[65] ^= 0x00200000;
                fp[66] ^= 0x00200000;
                fp[70] ^= 0x00200000;
                fp[74] ^= 0x00200000;
                fp[97] ^= 0x00200000;
                fp[98] ^= 0x00200000;
                fp[102] ^= 0x00200000;
                fp[104] ^= 0x00200000;
            }
            fp += (neon_ee - neon_x8_t) / 4;
            //*fp++ = BL(fp+2, x_8_t_addr);

#else
            *fp = BL(fp+2, x_8_addr);
            fp++;
#endif
        } else {
            *fp = BL(fp+2, x_8_addr);
            fp++;
        }

        pAddr = pps[1] * 4;
        pN = pps[0];
        pLUT = p->ws_is[ffts_ctzl(pps[0]/leaf_N)-1]*8;//LUT_offset(pps[0], leafN);
        //	fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
        count += 4;
        pps += 2;
    }

    *fp++ = 0xecbd8b10;
    *fp++ = POP_LR();
    count++;
#else
    generate_epilogue(&fp);
#endif

    //	*fp++ = B(14); count++;

    //for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
    //	fprintf(stderr, "%08x\n", x_4_addr[i]);
    //fprintf(stderr, "\n");
    //for(int i=0;i<count;i++)

    //fprintf(stderr, "size of transform %u = %d\n", N, (fp - x_8_addr) * sizeof(*fp));

    free(ps);

#if defined(_MSC_VER)
#pragma warning(push)

    /* disable type cast warning from data pointer to function pointer */
#pragma warning(disable : 4055)
#endif

    return (transform_func_t) start;

#if defined(_MSC_VER)
#pragma warning(pop)
#endif
}