static void slow_call_thr_specific(MacroAssembler* _masm, Register thread) {

  // slow call to of thr_getspecific
  // int thr_getspecific(thread_key_t key, void **value);
  // Consider using pthread_getspecific instead.

__  push(0);                                                            // allocate space for return value
  if (thread != rax) __ push(rax);                                      // save rax, if caller still wants it
__  push(rcx);                                                          // save caller save
__  push(rdx);                                                          // save caller save
  if (thread != rax) {
__    lea(thread, Address(rsp, 3 * sizeof(int)));                       // address of return value
  } else {
__    lea(thread, Address(rsp, 2 * sizeof(int)));                       // address of return value
  }
__  push(thread);                                                       // and pass the address
__  push(ThreadLocalStorage::thread_index());                           // the key
__  call(RuntimeAddress(CAST_FROM_FN_PTR(address, thr_getspecific)));
__  increment(rsp, 2 * wordSize);
__  pop(rdx);
__  pop(rcx);
  if (thread != rax) __ pop(rax);
__  pop(thread);

}
Beispiel #2
0
// Helper to insert argument slots into the stack.
// arg_slots must be a multiple of stack_move_unit() and <= 0
void MethodHandles::insert_arg_slots(MacroAssembler* _masm,
                                     RegisterOrConstant arg_slots,
                                     int arg_mask,
                                     Register rax_argslot,
                                     Register rbx_temp, Register rdx_temp, Register temp3_reg) {
  assert(temp3_reg == noreg, "temp3 not required");
  assert_different_registers(rax_argslot, rbx_temp, rdx_temp,
                             (!arg_slots.is_register() ? rsp : arg_slots.as_register()));

#ifdef ASSERT
  verify_argslot(_masm, rax_argslot, "insertion point must fall within current frame");
  if (arg_slots.is_register()) {
    Label L_ok, L_bad;
    __ cmpptr(arg_slots.as_register(), (int32_t) NULL_WORD);
    __ jccb(Assembler::greater, L_bad);
    __ testl(arg_slots.as_register(), -stack_move_unit() - 1);
    __ jccb(Assembler::zero, L_ok);
    __ bind(L_bad);
    __ stop("assert arg_slots <= 0 and clear low bits");
    __ bind(L_ok);
  } else {
    assert(arg_slots.as_constant() <= 0, "");
    assert(arg_slots.as_constant() % -stack_move_unit() == 0, "");
  }
#endif //ASSERT

#ifdef _LP64
  if (arg_slots.is_register()) {
    // clean high bits of stack motion register (was loaded as an int)
    __ movslq(arg_slots.as_register(), arg_slots.as_register());
  }
#endif

  // Make space on the stack for the inserted argument(s).
  // Then pull down everything shallower than rax_argslot.
  // The stacked return address gets pulled down with everything else.
  // That is, copy [rsp, argslot) downward by -size words.  In pseudo-code:
  //   rsp -= size;
  //   for (rdx = rsp + size; rdx < argslot; rdx++)
  //     rdx[-size] = rdx[0]
  //   argslot -= size;
  BLOCK_COMMENT("insert_arg_slots {");
  __ mov(rdx_temp, rsp);                        // source pointer for copy
  __ lea(rsp, Address(rsp, arg_slots, Address::times_ptr));
  {
    Label loop;
    __ BIND(loop);
    // pull one word down each time through the loop
    __ movptr(rbx_temp, Address(rdx_temp, 0));
    __ movptr(Address(rdx_temp, arg_slots, Address::times_ptr), rbx_temp);
    __ addptr(rdx_temp, wordSize);
    __ cmpptr(rdx_temp, rax_argslot);
    __ jccb(Assembler::less, loop);
  }

  // Now move the argslot down, to point to the opened-up space.
  __ lea(rax_argslot, Address(rax_argslot, arg_slots, Address::times_ptr));
  BLOCK_COMMENT("} insert_arg_slots");
}
Beispiel #3
0
void main()
{
	int n;
	void inorden (struct nodo *raiz);
	struct nodo *raiz=NULL;
	int ins_avl(struct nodo **,int);
	int lea();
	printf ("lea n \n"); n = lea();
	while (n != 9999) {
		ins_avl (&raiz,n);
		printf ("lea n \n"); n = lea();
	}
	inorden(raiz);
}
Beispiel #4
0
// inputs should be preserved outside if required since we do a call
// num_std_need_to_save registers will be preserved
char * m2n_gen_push_m2n(char * buf, Method_Handle method, 
                        frame_type current_frame_type, 
                        bool handles,
                        unsigned num_callee_saves,
                        unsigned num_std_need_to_save,
                        I_32 bytes_to_m2n_top) {
    // skip callee-saves registers
    bytes_to_m2n_top -= num_callee_saves * LcgEM64TContext::GR_SIZE;
    // TODO: check if it makes sense to save all callee-saves registers here
    //store rest of callee-saves registers
    for (unsigned i = num_callee_saves; i < LcgEM64TContext::MAX_GR_LOCALS; i++) {
        bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
        buf = mov(buf,
            M_Base_Opnd(rsp_reg, bytes_to_m2n_top),
            LcgEM64TContext::get_reg_from_map(LcgEM64TContext::GR_LOCALS_OFFSET + i),
            size_64);        
    }
    // init pop_regs to null
    bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
    buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top),
        Imm_Opnd(size_32, 0), size_64);
    // store current_frame_type
    bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
    assert(fit32(current_frame_type));
    buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top),
        Imm_Opnd(size_32, current_frame_type), size_64);
    // store a method associated with the current m2n frame
    bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
    if (fit32((int64)method)) {
        buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top),
            Imm_Opnd(size_32, (int64)method), size_64);
    } else {
        buf = mov(buf, rax_opnd, Imm_Opnd(size_64, (int64)method), size_64);
        buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top), rax_opnd);
    }
    // store local object handles
    bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
    buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top),
        Imm_Opnd(size_64, (int64)0), size_64);

    // move pointer to the current VM_Thread structure to rax
    buf = m2n_gen_ts_to_register(buf, &rax_opnd,
        num_callee_saves, LcgEM64TContext::MAX_GR_LOCALS,
        num_std_need_to_save, 0);
    
    // shift to the last_m2n_frame field
    I_32 last_m2n_frame_offset = (I_32)(int64)&((VM_thread*)0)->last_m2n_frame;
    buf = alu(buf, add_opc,  rax_opnd,  Imm_Opnd(size_32, last_m2n_frame_offset), size_64);
    // store pointer to pointer to last m2n frame
    bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
    buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top), rax_opnd, size_64);
    // save pointer to the previous m2n frame
    bytes_to_m2n_top -= LcgEM64TContext::GR_SIZE;
    buf = mov(buf, r9_opnd, M_Base_Opnd(rax_reg, 0));
    buf = mov(buf, M_Base_Opnd(rsp_reg, bytes_to_m2n_top), r9_opnd, size_64);
    // update last m2n frame of the current thread
    buf = lea(buf, r9_opnd, M_Base_Opnd(rsp_reg, bytes_to_m2n_top));
    buf = mov(buf,  M_Base_Opnd(rax_reg, 0), r9_opnd, size_64);
    return buf;
}
Beispiel #5
0
	Code()
		: Xbyak::CodeGenerator(sizeof(buf), buf)
	{
		puts("generate");
		printf("ptr=%p, %p\n", getCode(), buf);
		Xbyak::CodeArray::protect(buf, sizeof(buf), true);
#ifdef XBYAK32
		mov(eax, ptr [esp + 4]);
		add(eax, ptr [esp + 8]);
#elif defined(XBYAK64_WIN)
		lea(rax, ptr [rcx + rdx]);
#else
		lea(rax, ptr [rdi + rsi]);
#endif
		ret();
	}
Beispiel #6
0
int lea_grafo (V grafo[],V g_invertido[],
			int *nv, int *nact )
{
	void ins_lista (V g[],int v, int ad,
			int act, int tiempo);
	int v,ad,i,n,act,tiempo;
	int lea();
	PR("De numero de vertices...");
	SALTO;
	*nv = n = lea();
	PR("De numero de actividades...");
	SALTO;
	*nact = lea();
	for (i=1; i <= n; i++) {
		grafo[i].v = g_invertido [i].v = 0;
		grafo[i].cab = g_invertido [i].cab = NULL;
	}
	PR ("Lea el primer vertice. 99 para terminar...");
	SALTO;
	v = lea();
	grafo[v].v = v;
	while (v != 99) {
		PR ("Lea adjunto, actividad, tiempo al vertice");
		printf ("%d ",v);
		PR(". 99 para terminar");
		SALTO;
		ad = lea(); act = lea(); tiempo = lea();
		while (ad != 99) {
			ins_lista (grafo,v,ad,act,tiempo);
			ins_lista (g_invertido,ad,v,act,tiempo);
			PR ("Lea adjunto, actividad, tiempo al vertice ");
			printf ("%d ",v);
			PR(". 99 para terminar");
			SALTO;
			ad = lea(); act = lea(); tiempo = lea();
		}
		PR ("Lea otro vertice. 99 para terminar...");
		SALTO;
		v = lea();
		grafo[v].v = v;
	}
	return (n);
}
void InterpreterRuntime::SignatureHandlerGenerator::generate(uint64_t fingerprint) {
  // generate code to handle arguments
  iterate(fingerprint);

  // return result handler
  __ lea(rax, ExternalAddress(Interpreter::result_handler(method()->result_type())));
  __ ret(0);

  __ flush();
}
void ArrayCopyStub::emit_code(LIR_Assembler* ce) {
  //---------------slow case: call to native-----------------
  __ bind(_entry);
  // Figure out where the args should go
  // This should really convert the IntrinsicID to the Method* and signature
  // but I don't know how to do that.
  //
  VMRegPair args[5];
  BasicType signature[5] = { T_OBJECT, T_INT, T_OBJECT, T_INT, T_INT};
  SharedRuntime::java_calling_convention(signature, args, 5, true);

  // push parameters
  // (src, src_pos, dest, destPos, length)
  Register r[5];
  r[0] = src()->as_register();
  r[1] = src_pos()->as_register();
  r[2] = dst()->as_register();
  r[3] = dst_pos()->as_register();
  r[4] = length()->as_register();

  // next registers will get stored on the stack
  for (int i = 0; i < 5 ; i++ ) {
    VMReg r_1 = args[i].first();
    if (r_1->is_stack()) {
      int st_off = r_1->reg2stack() * wordSize;
      __ str (r[i], Address(sp, st_off));
    } else {
      assert(r[i] == args[i].first()->as_Register(), "Wrong register for arg ");
    }
  }

  ce->align_call(lir_static_call);

  ce->emit_static_call_stub();
  if (ce->compilation()->bailed_out()) {
    return; // CodeCache is full
  }
  Address resolve(SharedRuntime::get_resolve_static_call_stub(),
                  relocInfo::static_call_type);
  address call = __ trampoline_call(resolve);
  if (call == NULL) {
    ce->bailout("trampoline stub overflow");
    return;
  }
  ce->add_call_info_here(info());

#ifndef PRODUCT
  __ lea(rscratch2, ExternalAddress((address)&Runtime1::_arraycopy_slowcase_cnt));
  __ incrementw(Address(rscratch2));
#endif

  __ b(_continuation);
}
Beispiel #9
0
int lea_grafo (int grafo[][MAXIMO])
{
	int v,ad,i,j,n,costo,lea();

	PR("De numero de vertices...");
	SALTO;
	n = lea();
	for (i=1; i <= n; i++)
		for (j=1; j <=n; j++)
			grafo[i][j] = 32767;
	PR ("Vertice  ... ");
	v = 1;
	printf ("%d",v);
	SALTO;
	while (v <= n) {
		PR ("Lea el primer adjunto al vertice ");
		printf ("%d ",v);
		PR(". 99 para terminar");
		SALTO;
		ad = lea();
		while (ad != 99) {
			PR("Lea costo del arco");SALTO;
			costo = lea();
			grafo [v][ad] = costo;
			PR ("Lea otro adjunto al vertice ");
			printf ("%d ",v);
			PR(". 99 para terminar");
			SALTO;
			ad = lea();
		}
		PR ("Vertice ...");
		v++;
		printf ("%d ",v);
		SALTO;
	}
	return (n);
}
void main()
{
	pagina *raiz=NULL;
	int x,min,s;
	int lea();
	void ins_b (pagina **raiz,int x,int *s);
	void listar1_b (pagina *p,int l);
	void retira_b (pagina **raiz, int x, int *s);
	printf ("Comienzo..\n");
	printf ("De llave\n");
	min = lea();
	while (min != 9999) {
		ins_b (&raiz, min, &s);
		if (s == 1)
			printf ("La llave ya existe\n");
		printf ("De llave\n");
		min = lea();
	}
	printf ("\n");
	listar1_b (raiz,0);
	getch();

	printf ("\nRetiros\n");
	printf ("De llave a retirar\n");
	x = lea();
	while (x != 9999) {
		retira_b (&raiz, x, &s);
		if (s == 0)
			printf ("La llave no existe\n");
		printf ("De llave a retirar\n");
		x = lea();
	}
	printf ("\n");
	listar1_b (raiz,0);
	getch();

}
/**
 * Method entry for static native methods:
 *   int java.util.zip.CRC32.update(int crc, int b)
 */
address TemplateInterpreterGenerator::generate_CRC32_update_entry() {
  if (UseCRC32Intrinsics) {
    address entry = __ pc();

    // rbx,: Method*
    // r13: senderSP must preserved for slow path, set SP to it on fast path
    // c_rarg0: scratch (rdi on non-Win64, rcx on Win64)
    // c_rarg1: scratch (rsi on non-Win64, rdx on Win64)

    Label slow_path;
    // If we need a safepoint check, generate full interpreter entry.
    ExternalAddress state(SafepointSynchronize::address_of_state());
    __ cmp32(ExternalAddress(SafepointSynchronize::address_of_state()),
             SafepointSynchronize::_not_synchronized);
    __ jcc(Assembler::notEqual, slow_path);

    // We don't generate local frame and don't align stack because
    // we call stub code and there is no safepoint on this path.

    // Load parameters
    const Register crc = rax;  // crc
    const Register val = c_rarg0;  // source java byte value
    const Register tbl = c_rarg1;  // scratch

    // Arguments are reversed on java expression stack
    __ movl(val, Address(rsp,   wordSize)); // byte value
    __ movl(crc, Address(rsp, 2*wordSize)); // Initial CRC

    __ lea(tbl, ExternalAddress(StubRoutines::crc_table_addr()));
    __ notl(crc); // ~crc
    __ update_byte_crc32(crc, val, tbl);
    __ notl(crc); // ~crc
    // result in rax

    // _areturn
    __ pop(rdi);                // get return address
    __ mov(rsp, r13);           // set sp to sender sp
    __ jmp(rdi);

    // generate a vanilla native entry as the slow path
    __ bind(slow_path);
    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
    return entry;
  }
  return NULL;
}
Beispiel #12
0
void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adaptername) {
  if (!TraceMethodHandles)  return;
  BLOCK_COMMENT("trace_method_handle {");
  __ push(rax);
  __ lea(rax, Address(rsp, wordSize*6)); // entry_sp
  __ pusha();
  // arguments:
  __ push(rbp);               // interpreter frame pointer
  __ push(rsi);               // saved_sp
  __ push(rax);               // entry_sp
  __ push(rcx);               // mh
  __ push(rcx);
  __ movptr(Address(rsp, 0), (intptr_t) adaptername);
  __ call_VM_leaf(CAST_FROM_FN_PTR(address, trace_method_handle_stub), 5);
  __ popa();
  __ pop(rax);
  BLOCK_COMMENT("} trace_method_handle");
}
static void slow_call_thr_specific(MacroAssembler* _masm, Register thread) {
  // slow call to of thr_getspecific
  // int thr_getspecific(thread_key_t key, void **value);
  // Consider using pthread_getspecific instead.

  if (thread != rax) {
__    push(rax);
  }
__  push(0); // space for return value
__  push(rdi);
__  push(rsi);
__  lea(rsi, Address(rsp, 16)); // pass return value address
__  push(rdx);
__  push(rcx);
__  push(r8);
__  push(r9);
__  push(r10);
  // XXX
__  mov(r10, rsp);
__  andptr(rsp, -16);
__  push(r10);
__  push(r11);

__  movl(rdi, ThreadLocalStorage::thread_index());
__  call(RuntimeAddress(CAST_FROM_FN_PTR(address, thr_getspecific)));

__  pop(r11);
__  pop(rsp);
__  pop(r10);
__  pop(r9);
__  pop(r8);
__  pop(rcx);
__  pop(rdx);
__  pop(rsi);
__  pop(rdi);
__  pop(thread); // load return value
  if (thread != rax) {
__    pop(rax);
  }
}
Beispiel #14
0
main() {
	int ant,i,n,*p;
	printf ("Cuantos elementos desea Clasificar?\n");
	n = lea();
	printf ("\nDe el valor de los elementos\n");
	p = (int *) calloc (n,2);
	for (i = 1; i <= n; i++ ) {
		p[i] = random (1000);
		printf ("%d\n",p[i]);
	}
	S_INSERCION (p,n);
	ant = p[1];
	for (i=1; i<=n; i++) {
		if (ant > p[i]) {
			printf ("error");getch();
			exit(1);
		}
		else ant = p[i];
		printf ("%d ",p[i]);
	}
	getch();
}
Beispiel #15
0
main() {
	int ant,i,n,*p;
	printf ("Cuantos elementos desea Clasificar?\n");
	n = lea();
	printf ("\nDe el valor de los elementos\n");
	p = (int *) calloc (n,2);
	for (i = 0; i < n; i++ ) {
		p[i] = random (1000);
		printf ("%d\n",p[i]);
	}
	QUICK_SORT (p,0,n-1);
	ant = p[0];
	for (i=0; i<n; i++) {
		if (ant > p[i]) {
			printf ("error");getch();
			exit(1);
		}
		else ant = p[i];
		printf ("%d ",p[i]);
	}
	getch();
}
Beispiel #16
0
	/*
		double jit(double x);
		@note 32bit: x : [esp+4], return fp0
		      64bit: x [rcx](win), xmm0(gcc), return xmm0
	*/
	Jit()
		: negConst_(0x8000000000000000ULL)
		, constTblPos_(0)
		, regIdx_(-1)
#ifdef XBYAK32
		, varTbl_(eax)
		, tbl_(edx)
#elif defined(XBYAK64_WIN)
		, tbl_(rcx)
#else
		, tbl_(rdi)
#endif
	{
#ifdef XBYAK32
		lea(varTbl_, ptr[esp+4]);
#else
#ifdef XBYAK64_WIN
		movaps(ptr [rsp + 8], xm6); // save xm6, xm7
		movaps(ptr [rsp + 8 + 16], xm7);
#endif
		movaps(xm7, xm0); // save xm0
#endif
		mov(tbl_, (size_t)constTbl_);
	}
address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) {
  const char *name;
  switch (type) {
    case T_FLOAT:  name = "jni_fast_GetFloatField";  break;
    case T_DOUBLE: name = "jni_fast_GetDoubleField"; break;
    default:       ShouldNotReachHere();
  }
  ResourceMark rm;
  BufferBlob* b = BufferBlob::create(name, BUFFER_SIZE*wordSize);
  address fast_entry = b->instructions_begin();
  CodeBuffer cbuf(fast_entry, b->instructions_size());
  MacroAssembler* masm = new MacroAssembler(&cbuf);

  Label slow_with_pop, slow;

  // stack layout:    offset from rsp (in words):
  //  return pc        0
  //  jni env          1
  //  obj              2
  //  jfieldID         3

  ExternalAddress counter(SafepointSynchronize::safepoint_counter_addr());

  __ mov32 (rcx, counter);
  __ testb (rcx, 1);
  __ jcc (Assembler::notZero, slow);
  if (os::is_MP()) {
    __ mov(rax, rcx);
    __ andptr(rax, 1);                         // rax, must end up 0
    __ movptr(rdx, Address(rsp, rax, Address::times_1, 2*wordSize));
                                              // obj, notice rax, is 0.
                                              // rdx is data dependent on rcx.
  } else {
    __ movptr(rdx, Address(rsp, 2*wordSize)); // obj
  }
  __ movptr(rax, Address(rsp, 3*wordSize));  // jfieldID
  __ movptr(rdx, Address(rdx, 0));           // *obj
  __ shrptr(rax, 2);                         // offset

  assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
  speculative_load_pclist[count] = __ pc();
  switch (type) {
#ifndef _LP64
    case T_FLOAT:  __ fld_s (Address(rdx, rax, Address::times_1)); break;
    case T_DOUBLE: __ fld_d (Address(rdx, rax, Address::times_1)); break;
#else
    case T_FLOAT:  __ movflt (xmm0, Address(robj, roffset, Address::times_1)); break;
    case T_DOUBLE: __ movdbl (xmm0, Address(robj, roffset, Address::times_1)); break;
#endif // _LP64
    default:       ShouldNotReachHere();
  }

  Address ca1;
  if (os::is_MP()) {
    __ fst_s (Address(rsp, -4));
    __ lea(rdx, counter);
    __ movl (rax, Address(rsp, -4));
    // garbage hi-order bits on 64bit are harmless.
    __ xorptr(rdx, rax);
    __ xorptr(rdx, rax);
    __ cmp32(rcx, Address(rdx, 0));
                                          // rax, ^ counter_addr ^ rax, = address
                                          // ca1 is data dependent on the field
                                          // access.
  } else {
    __ cmp32(rcx, counter);
  }
  __ jcc (Assembler::notEqual, slow_with_pop);

#ifndef _WINDOWS
  __ ret (0);
#else
  // __stdcall calling convention
  __ ret (3*wordSize);
#endif

  __ bind (slow_with_pop);
  // invalid load. pop FPU stack.
  __ fstp_d (0);

  slowcase_entry_pclist[count++] = __ pc();
  __ bind (slow);
  address slow_case_addr;
  switch (type) {
    case T_FLOAT:  slow_case_addr = jni_GetFloatField_addr();  break;
    case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break;
    default:       ShouldNotReachHere();
  }
  // tail call
  __ jump (ExternalAddress(slow_case_addr));

  __ flush ();

#ifndef _WINDOWS
  return fast_entry;
#else
  switch (type) {
    case T_FLOAT:  jni_fast_GetFloatField_fp = (GetFloatField_t)fast_entry; break;
    case T_DOUBLE: jni_fast_GetDoubleField_fp = (GetDoubleField_t)fast_entry;
  }
  return os::win32::fast_jni_accessor_wrapper(type);
#endif
}
address JNI_FastGetField::generate_fast_get_long_field() {
  const char *name = "jni_fast_GetLongField";
  ResourceMark rm;
  BufferBlob* b = BufferBlob::create(name, BUFFER_SIZE*wordSize);
  address fast_entry = b->instructions_begin();
  CodeBuffer cbuf(fast_entry, b->instructions_size());
  MacroAssembler* masm = new MacroAssembler(&cbuf);

  Label slow;

  // stack layout:    offset from rsp (in words):
  //  old rsi          0
  //  return pc        1
  //  jni env          2
  //  obj              3
  //  jfieldID         4

  ExternalAddress counter(SafepointSynchronize::safepoint_counter_addr());

  __ push  (rsi);
  __ mov32 (rcx, counter);
  __ testb (rcx, 1);
  __ jcc (Assembler::notZero, slow);
  if (os::is_MP()) {
    __ mov(rax, rcx);
    __ andptr(rax, 1);                         // rax, must end up 0
    __ movptr(rdx, Address(rsp, rax, Address::times_1, 3*wordSize));
                                              // obj, notice rax, is 0.
                                              // rdx is data dependent on rcx.
  } else {
    __ movptr(rdx, Address(rsp, 3*wordSize));  // obj
  }
  __ movptr(rsi, Address(rsp, 4*wordSize));  // jfieldID
  __ movptr(rdx, Address(rdx, 0));           // *obj
  __ shrptr(rsi, 2);                         // offset

  assert(count < LIST_CAPACITY-1, "LIST_CAPACITY too small");
  speculative_load_pclist[count++] = __ pc();
  __ movptr(rax, Address(rdx, rsi, Address::times_1));
#ifndef _LP64
  speculative_load_pclist[count] = __ pc();
  __ movl(rdx, Address(rdx, rsi, Address::times_1, 4));
#endif // _LP64

  if (os::is_MP()) {
    __ lea(rsi, counter);
    __ xorptr(rsi, rdx);
    __ xorptr(rsi, rax);
    __ xorptr(rsi, rdx);
    __ xorptr(rsi, rax);
    __ cmp32(rcx, Address(rsi, 0));
    // ca1 is the same as ca because
    // rax, ^ rdx ^ counter_addr ^ rax, ^ rdx = address
    // ca1 is data dependent on both rax, and rdx.
  } else {
    __ cmp32(rcx, counter);
  }
  __ jcc (Assembler::notEqual, slow);

  __ pop (rsi);

#ifndef _WINDOWS
  __ ret (0);
#else
  // __stdcall calling convention
  __ ret (3*wordSize);
#endif

  slowcase_entry_pclist[count-1] = __ pc();
  slowcase_entry_pclist[count++] = __ pc();
  __ bind (slow);
  __ pop  (rsi);
  address slow_case_addr = jni_GetLongField_addr();;
  // tail call
  __ jump (ExternalAddress(slow_case_addr));

  __ flush ();

#ifndef _WINDOWS
  return fast_entry;
#else
  jni_fast_GetLongField_fp = (GetLongField_t)fast_entry;
  return os::win32::fast_jni_accessor_wrapper(T_LONG);
#endif
}
Beispiel #19
0
struct code encode(struct instruction instr)
{
    switch (instr.opcode) {
    case INSTR_ADD:
        return add(instr.optype, instr.source, instr.dest);
    case INSTR_NOT:
        return not(instr.optype, instr.source);
    case INSTR_MUL:
        return mul(instr.optype, instr.source);
    case INSTR_XOR:
        return xor(instr.optype, instr.source, instr.dest);
    case INSTR_DIV:
        return encode_div(instr.optype, instr.source);
    case INSTR_AND:
        return and(instr.optype, instr.source, instr.dest);
    case INSTR_OR:
        return or(instr.optype, instr.source, instr.dest);
    case INSTR_SHL:
        return shl(instr.optype, instr.source, instr.dest);
    case INSTR_SHR:
        return shr(instr.optype, instr.source, instr.dest);
    case INSTR_SAR:
        return sar(instr.optype, instr.source, instr.dest);
    case INSTR_CALL:
        return call(instr.optype, instr.source);
    case INSTR_CMP:
        return cmp(instr.optype, instr.source, instr.dest);
    case INSTR_MOV:
        return mov(instr.optype, instr.source, instr.dest);
    case INSTR_MOVSX:
        return movsx(instr.optype, instr.source, instr.dest);
    case INSTR_MOVZX:
        return movzx(instr.optype, instr.source, instr.dest);
    case INSTR_MOVAPS:
        return movaps(instr.optype, instr.source, instr.dest);
    case INSTR_PUSH:
        return push(instr.optype, instr.source);
    case INSTR_SUB:
        return sub(instr.optype, instr.source, instr.dest);
    case INSTR_LEA:
        return lea(instr.optype, instr.source, instr.dest);
    case INSTR_LEAVE:
        return leave();
    case INSTR_REP_MOVSQ:
        assert(instr.optype == OPT_NONE);
        return rep_movsq();
    case INSTR_RET:
        return ret();
    case INSTR_JMP:
        return jmp(instr.optype, instr.source);
    case INSTR_JA:
        return jcc(instr.optype, TEST_A, instr.source);
    case INSTR_JG:
        return jcc(instr.optype, TEST_G, instr.source);
    case INSTR_JZ:
        return jcc(instr.optype, TEST_Z, instr.source);
    case INSTR_JAE:
        return jcc(instr.optype, TEST_AE, instr.source);
    case INSTR_JGE:
        return jcc(instr.optype, TEST_GE, instr.source);
    case INSTR_SETZ:
        return setcc(instr.optype, TEST_Z, instr.source);
    case INSTR_SETA:
        return setcc(instr.optype, TEST_A, instr.source);
    case INSTR_SETG:
        return setcc(instr.optype, TEST_G, instr.source);
    case INSTR_SETAE:
        return setcc(instr.optype, TEST_AE, instr.source);
    case INSTR_SETGE:
        return setcc(instr.optype, TEST_GE, instr.source);
    case INSTR_TEST:
        return test(instr.optype, instr.source, instr.dest);
    default:
        return nop();
    }
}
Beispiel #20
0
	explicit Code(int mode, size_t size, void *p)
		: Xbyak::CodeGenerator(size, p)
	{
		inLocalLabel();
#ifdef XBYAK64
		const Xbyak::Reg64& a = rax;
		const Xbyak::Reg64& c = rcx;
#ifdef XBYAK64_WIN
		mov(rax, rcx);
#else
		mov(rax, rdi);
#endif
#else
		const Xbyak::Reg32& a = eax;
		const Xbyak::Reg32& c = ecx;
		mov(a, ptr [esp + 4]);
#endif

		switch (mode) {
		case 0:
			mov(c, ".jmp_table");
			lea(c, ptr [c + a * 8]);
			jmp(c);
		align(8);
			L(".jmp_table");
			mov(a, expectTbl[0]);
			ret();
		align(8);
			mov(a, expectTbl[1]);
			ret();
		align(8);
			mov(a, expectTbl[2]);
			ret();
			break;

		case 1:
			/*
				the label for putL is defined when called
			*/
			mov(c, ".jmp_table");
			jmp(ptr [c + a * (int)sizeof(size_t)]);
		L(".label1");
			mov(a, expectTbl[0]);
			jmp(".end");
		L(".label2");
			mov(a, expectTbl[1]);
			jmp(".end");
		L(".label3");
			mov(a, expectTbl[2]);
			jmp(".end");
		L(".end");
			ret();
			ud2();

			align(8);
		L(".jmp_table");
		putL(".label1");
		putL(".label2");
		putL(".label3");
			break;

		case 2:
			/*
				the label for putL is not defined when called
			*/
			jmp(".in");
			ud2();
			align(8);
		L(".jmp_table");
		putL(".label1");
		putL(".label2");
		putL(".label3");
		L(".in");
			mov(c, ".jmp_table");
			jmp(ptr [c + a * (int)sizeof(size_t)]);
		L(".label1");
			mov(a, expectTbl[0]);
			jmp(".end");
		L(".label2");
			mov(a, expectTbl[1]);
			jmp(".end");
		L(".label3");
			mov(a, expectTbl[2]);
			jmp(".end");
		L(".end");
			ret();
			break;
		}
		outLocalLabel();
	}
void MacroAssembler::fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp, eax, ecx, edx);
  jmp(start);
  address static_const_table = (address)_static_const_table;

  bind(start);
  subl(rsp, 120);
  movl(Address(rsp, 64), tmp);
  lea(tmp, ExternalAddress(static_const_table));
  movdqu(xmm0, Address(rsp, 128));
  unpcklpd(xmm0, xmm0);
  movdqu(xmm1, Address(tmp, 64));          // 0x652b82feUL, 0x40571547UL, 0x652b82feUL, 0x40571547UL
  movdqu(xmm6, Address(tmp, 48));          // 0x00000000UL, 0x43380000UL, 0x00000000UL, 0x43380000UL
  movdqu(xmm2, Address(tmp, 80));          // 0xfefa0000UL, 0x3f862e42UL, 0xfefa0000UL, 0x3f862e42UL
  movdqu(xmm3, Address(tmp, 96));          // 0xbc9e3b3aUL, 0x3d1cf79aUL, 0xbc9e3b3aUL, 0x3d1cf79aUL
  pextrw(eax, xmm0, 3);
  andl(eax, 32767);
  movl(edx, 16527);
  subl(edx, eax);
  subl(eax, 15504);
  orl(edx, eax);
  cmpl(edx, INT_MIN);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);
  mulpd(xmm1, xmm0);
  addpd(xmm1, xmm6);
  movapd(xmm7, xmm1);
  subpd(xmm1, xmm6);
  mulpd(xmm2, xmm1);
  movdqu(xmm4, Address(tmp, 128));         // 0xe3289860UL, 0x3f56c15cUL, 0x555b9e25UL, 0x3fa55555UL
  mulpd(xmm3, xmm1);
  movdqu(xmm5, Address(tmp, 144));         // 0xc090cf0fUL, 0x3f811115UL, 0x55548ba1UL, 0x3fc55555UL
  subpd(xmm0, xmm2);
  movdl(eax, xmm7);
  movl(ecx, eax);
  andl(ecx, 63);
  shll(ecx, 4);
  sarl(eax, 6);
  movl(edx, eax);
  movdqu(xmm6, Address(tmp, 16));          // 0xffffffc0UL, 0x00000000UL, 0xffffffc0UL, 0x00000000UL
  pand(xmm7, xmm6);
  movdqu(xmm6, Address(tmp, 32));          // 0x0000ffc0UL, 0x00000000UL, 0x0000ffc0UL, 0x00000000UL
  paddq(xmm7, xmm6);
  psllq(xmm7, 46);
  subpd(xmm0, xmm3);
  movdqu(xmm2, Address(tmp, ecx, Address::times_1, 160));
  mulpd(xmm4, xmm0);
  movapd(xmm6, xmm0);
  movapd(xmm1, xmm0);
  mulpd(xmm6, xmm6);
  mulpd(xmm0, xmm6);
  addpd(xmm5, xmm4);
  mulsd(xmm0, xmm6);
  mulpd(xmm6, Address(tmp, 112));          // 0xfffffffeUL, 0x3fdfffffUL, 0xfffffffeUL, 0x3fdfffffUL
  addsd(xmm1, xmm2);
  unpckhpd(xmm2, xmm2);
  mulpd(xmm0, xmm5);
  addsd(xmm1, xmm0);
  por(xmm2, xmm7);
  unpckhpd(xmm0, xmm0);
  addsd(xmm0, xmm1);
  addsd(xmm0, xmm6);
  addl(edx, 894);
  cmpl(edx, 1916);
  jcc (Assembler::above, L_2TAG_PACKET_1_0_2);
  mulsd(xmm0, xmm2);
  addsd(xmm0, xmm2);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_1_0_2);
  fnstcw(Address(rsp, 24));
  movzwl(edx, Address(rsp, 24));
  orl(edx, 768);
  movw(Address(rsp, 28), edx);
  fldcw(Address(rsp, 28));
  movl(edx, eax);
  sarl(eax, 1);
  subl(edx, eax);
  movdqu(xmm6, Address(tmp, 0));           // 0x00000000UL, 0xfff00000UL, 0x00000000UL, 0xfff00000UL
  pandn(xmm6, xmm2);
  addl(eax, 1023);
  movdl(xmm3, eax);
  psllq(xmm3, 52);
  por(xmm6, xmm3);
  addl(edx, 1023);
  movdl(xmm4, edx);
  psllq(xmm4, 52);
  movsd(Address(rsp, 8), xmm0);
  fld_d(Address(rsp, 8));
  movsd(Address(rsp, 16), xmm6);
  fld_d(Address(rsp, 16));
  fmula(1);
  faddp(1);
  movsd(Address(rsp, 8), xmm4);
  fld_d(Address(rsp, 8));
  fmulp(1);
  fstp_d(Address(rsp, 8));
  movsd(xmm0,Address(rsp, 8));
  fldcw(Address(rsp, 24));
  pextrw(ecx, xmm0, 3);
  andl(ecx, 32752);
  cmpl(ecx, 32752);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_3_0_2);
  cmpl(ecx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_2_0_2);
  cmpl(ecx, INT_MIN);
  jcc(Assembler::less, L_2TAG_PACKET_3_0_2);
  cmpl(ecx, -1064950997);
  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
  jcc(Assembler::greater, L_2TAG_PACKET_4_0_2);
  movl(edx, Address(rsp, 128));
  cmpl(edx ,-17155601);
  jcc(Assembler::less, L_2TAG_PACKET_2_0_2);
  jmp(L_2TAG_PACKET_4_0_2);

  bind(L_2TAG_PACKET_3_0_2);
  movl(edx, 14);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  movl(edx, 15);

  bind(L_2TAG_PACKET_5_0_2);
  movsd(Address(rsp, 0), xmm0);
  movsd(xmm0, Address(rsp, 128));
  fld_d(Address(rsp, 0));
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_7_0_2);
  cmpl(eax, 2146435072);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_8_0_2);
  movl(eax, Address(rsp, 132));
  cmpl(eax, INT_MIN);
  jcc(Assembler::greaterEqual, L_2TAG_PACKET_9_0_2);
  movsd(xmm0, Address(tmp, 1208));         // 0xffffffffUL, 0x7fefffffUL
  mulsd(xmm0, xmm0);
  movl(edx, 14);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_9_0_2);
  movsd(xmm0, Address(tmp, 1216));
  mulsd(xmm0, xmm0);
  movl(edx, 15);
  jmp(L_2TAG_PACKET_5_0_2);

  bind(L_2TAG_PACKET_8_0_2);
  movl(edx, Address(rsp, 128));
  cmpl(eax, 2146435072);
  jcc(Assembler::above, L_2TAG_PACKET_10_0_2);
  cmpl(edx, 0);
  jcc(Assembler::notEqual, L_2TAG_PACKET_10_0_2);
  movl(eax, Address(rsp, 132));
  cmpl(eax, 2146435072);
  jcc(Assembler::notEqual, L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(tmp, 1192));         // 0x00000000UL, 0x7ff00000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_11_0_2);
  movsd(xmm0, Address(tmp, 1200));         // 0x00000000UL, 0x00000000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_10_0_2);
  movsd(xmm0, Address(rsp, 128));
  addsd(xmm0, xmm0);
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_0_0_2);
  movl(eax, Address(rsp, 132));
  andl(eax, 2147483647);
  cmpl(eax, 1083179008);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_7_0_2);
  movsd(xmm0, Address(rsp, 128));
  addsd(xmm0, Address(tmp, 1184));         // 0x00000000UL, 0x3ff00000UL
  jmp(L_2TAG_PACKET_2_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  movsd(Address(rsp, 48), xmm0);
  fld_d(Address(rsp, 48));

  bind(L_2TAG_PACKET_6_0_2);
  movl(tmp, Address(rsp, 64));
}
void GSDrawScanlineCodeGenerator::Init()
{
	// int skip = left & 3;

	mov(rbx, rdx);
	and(rdx, 3);

	// left -= skip;

	sub(rbx, rdx);

	// int steps = pixels + skip - 4;

	lea(rcx, ptr[rcx + rdx - 4]);

	// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];

	shl(rdx, 4);

	vmovdqa(xmm15, ptr[rdx + r10]);

	mov(rax, rcx);
	sar(rax, 63);
	and(rax, rcx);
	shl(rax, 4);

	vpor(xmm15, ptr[rax + r10 + 7 * 16]);

	// GSVector2i* fza_base = &m_local.gd->fzbr[top];

	mov(rax, (size_t)m_local.gd->fzbr);
	lea(rsi, ptr[rax + r8 * 8]);

	// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];

	mov(rax, (size_t)m_local.gd->fzbc);
	lea(rdi, ptr[rax + rbx * 2]);

	if(!m_sel.sprite && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
	{
		// edx = &m_local.d[skip]

		lea(rdx, ptr[rdx * 8 + r11 + offsetof(GSScanlineLocalData, d)]);
	}

	if(!m_sel.sprite)
	{
		if(m_sel.fwrite && m_sel.fge || m_sel.zb)
		{
			vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, p)]); // v.p

			if(m_sel.fwrite && m_sel.fge)
			{
				// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);

				vcvttps2dq(xmm9, xmm0);
				vpshufhw(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2));
				vpshufd(xmm9, xmm9, _MM_SHUFFLE(2, 2, 2, 2));
				vpaddw(xmm9, ptr[rdx + 16 * 6]);
			}

			if(m_sel.zb)
			{
				// z = vp.zzzz() + m_local.d[skip].z;

				vshufps(xmm8, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
				vaddps(xmm8, ptr[rdx]);
			}
		}
	}
	else
	{
		if(m_sel.ztest)
		{
			vmovdqa(xmm8, ptr[r11 + offsetof(GSScanlineLocalData, p.z)]);
		}
	}

	if(m_sel.fb)
	{
		if(m_sel.edge || m_sel.tfx != TFX_NONE)
		{
			vmovaps(xmm0, ptr[r9 + offsetof(GSVertexSW, t)]); // v.t
		}

		if(m_sel.edge)
		{
			vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
			vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
			vpsrlw(xmm1, 9);

			vmovdqa(ptr[r11 + offsetof(GSScanlineLocalData, temp.cov)], xmm1);
		}

		if(m_sel.tfx != TFX_NONE)
		{
			if(m_sel.fst)
			{
				// GSVector4i vti(vt);

				vcvttps2dq(xmm0, xmm0);

				// s = vti.xxxx() + m_local.d[skip].s;
				// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;

				vpshufd(xmm10, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
				vpshufd(xmm11, xmm0, _MM_SHUFFLE(1, 1, 1, 1));

				vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);

				if(!m_sel.sprite || m_sel.mmin)
				{
					vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
				}
				else
				{
					if(m_sel.ltf)
					{
						vpshuflw(xmm6, xmm11, _MM_SHUFFLE(2, 2, 0, 0));
						vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
						vpsrlw(xmm6, 1);
					}
				}
			}
			else
			{
				// s = vt.xxxx() + m_local.d[skip].s;
				// t = vt.yyyy() + m_local.d[skip].t;
				// q = vt.zzzz() + m_local.d[skip].q;

				vshufps(xmm10, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
				vshufps(xmm11, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
				vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

				vaddps(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);
				vaddps(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
				vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]);
			}
		}

		if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
		{
			if(m_sel.iip)
			{
				// GSVector4i vc = GSVector4i(v.c);

				vcvttps2dq(xmm0, ptr[r9 + offsetof(GSVertexSW, c)]); // v.c

				// vc = vc.upl16(vc.zwxy());

				vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
				vpunpcklwd(xmm0, xmm1);

				// rb = vc.xxxx().add16(m_local.d[skip].rb);
				// ga = vc.zzzz().add16(m_local.d[skip].ga);

				vpshufd(xmm13, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
				vpshufd(xmm14, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

				vpaddw(xmm13, ptr[rdx + offsetof(GSScanlineLocalData::skip, rb)]);
				vpaddw(xmm14, ptr[rdx + offsetof(GSScanlineLocalData::skip, ga)]);
			}
			else
			{
				vmovdqa(xmm13, ptr[r11 + offsetof(GSScanlineLocalData, c.rb)]);
				vmovdqa(xmm14, ptr[r11 + offsetof(GSScanlineLocalData, c.ga)]);
			}
		}
	}
}
void MacroAssembler::fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp1, Register tmp2) {
  Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
  Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
  Label L_2TAG_PACKET_8_0_2;
  Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;

  assert_different_registers(tmp1, tmp2, eax, ecx, edx);
  jmp(start);
  address L_tbl = (address)_L_tbl;
  address log2 = (address)_log2;
  address coeff = (address)_coeff;

  bind(start);
  subq(rsp, 24);
  movsd(Address(rsp, 0), xmm0);
  mov64(rax, 0x3ff0000000000000);
  movdq(xmm2, rax);
  mov64(rdx, 0x77f0000000000000);
  movdq(xmm3, rdx);
  movl(ecx, 32768);
  movdl(xmm4, rcx);
  mov64(tmp1, 0xffffe00000000000);
  movdq(xmm5, tmp1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  movl(ecx, 16352);
  psrlq(xmm0, 27);
  lea(tmp2, ExternalAddress(L_tbl));
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  subl(eax, 16);
  cmpl(eax, 32736);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_0_0_2);

  bind(L_2TAG_PACKET_1_0_2);
  paddd(xmm0, xmm4);
  por(xmm1, xmm3);
  movdl(edx, xmm0);
  psllq(xmm0, 29);
  pand(xmm5, xmm1);
  pand(xmm0, xmm6);
  subsd(xmm1, xmm5);
  mulpd(xmm5, xmm0);
  andl(eax, 32752);
  subl(eax, ecx);
  cvtsi2sdl(xmm7, eax);
  mulsd(xmm1, xmm0);
  movq(xmm6, ExternalAddress(log2));       // 0xfefa3800UL, 0x3fa62e42UL
  movdqu(xmm3, ExternalAddress(coeff));    // 0x92492492UL, 0x3fc24924UL, 0x00000000UL, 0xbfd00000UL
  subsd(xmm5, xmm2);
  andl(edx, 16711680);
  shrl(edx, 12);
  movdqu(xmm0, Address(tmp2, edx));
  movdqu(xmm4, ExternalAddress(16 + coeff)); // 0x3d6fb175UL, 0xbfc5555eUL, 0x55555555UL, 0x3fd55555UL
  addsd(xmm1, xmm5);
  movdqu(xmm2, ExternalAddress(32 + coeff)); // 0x9999999aUL, 0x3fc99999UL, 0x00000000UL, 0xbfe00000UL
  mulsd(xmm6, xmm7);
  movddup(xmm5, xmm1);
  mulsd(xmm7, ExternalAddress(8 + log2));    // 0x93c76730UL, 0x3ceef357UL
  mulsd(xmm3, xmm1);
  addsd(xmm0, xmm6);
  mulpd(xmm4, xmm5);
  mulpd(xmm5, xmm5);
  movddup(xmm6, xmm0);
  addsd(xmm0, xmm1);
  addpd(xmm4, xmm2);
  mulpd(xmm3, xmm5);
  subsd(xmm6, xmm0);
  mulsd(xmm4, xmm1);
  pshufd(xmm2, xmm0, 238);
  addsd(xmm1, xmm6);
  mulsd(xmm5, xmm5);
  addsd(xmm7, xmm2);
  addpd(xmm4, xmm3);
  addsd(xmm1, xmm7);
  mulpd(xmm4, xmm5);
  addsd(xmm1, xmm4);
  pshufd(xmm5, xmm4, 238);
  addsd(xmm1, xmm5);
  addsd(xmm0, xmm1);
  jmp(B1_5);

  bind(L_2TAG_PACKET_0_0_2);
  movq(xmm0, Address(rsp, 0));
  movq(xmm1, Address(rsp, 0));
  addl(eax, 16);
  cmpl(eax, 32768);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_2_0_2);
  cmpl(eax, 16);
  jcc(Assembler::below, L_2TAG_PACKET_3_0_2);

  bind(L_2TAG_PACKET_4_0_2);
  addsd(xmm0, xmm0);
  jmp(B1_5);

  bind(L_2TAG_PACKET_5_0_2);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  cmpl(edx, 0);
  jcc(Assembler::above, L_2TAG_PACKET_4_0_2);
  jmp(L_2TAG_PACKET_6_0_2);

  bind(L_2TAG_PACKET_3_0_2);
  xorpd(xmm1, xmm1);
  addsd(xmm1, xmm0);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  movl(eax, 18416);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movdqu(xmm1, xmm0);
  pextrw(eax, xmm0, 3);
  por(xmm0, xmm2);
  psrlq(xmm0, 27);
  movl(ecx, 18416);
  psrld(xmm0, 2);
  rcpps(xmm0, xmm0);
  psllq(xmm1, 12);
  pshufd(xmm6, xmm5, 228);
  psrlq(xmm1, 12);
  jmp(L_2TAG_PACKET_1_0_2);

  bind(L_2TAG_PACKET_2_0_2);
  movdl(edx, xmm1);
  psrlq(xmm1, 32);
  movdl(ecx, xmm1);
  addl(ecx, ecx);
  cmpl(ecx, -2097152);
  jcc(Assembler::aboveEqual, L_2TAG_PACKET_5_0_2);
  orl(edx, ecx);
  cmpl(edx, 0);
  jcc(Assembler::equal, L_2TAG_PACKET_7_0_2);

  bind(L_2TAG_PACKET_6_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 32752);
  pinsrw(xmm1, eax, 3);
  mulsd(xmm0, xmm1);
  movl(Address(rsp, 16), 3);
  jmp(L_2TAG_PACKET_8_0_2);
  bind(L_2TAG_PACKET_7_0_2);
  xorpd(xmm1, xmm1);
  xorpd(xmm0, xmm0);
  movl(eax, 49136);
  pinsrw(xmm0, eax, 3);
  divsd(xmm0, xmm1);
  movl(Address(rsp, 16), 2);

  bind(L_2TAG_PACKET_8_0_2);
  movq(Address(rsp, 8), xmm0);

  bind(B1_3);
  movq(xmm0, Address(rsp, 8));

  bind(B1_5);
  addq(rsp, 24);
}
VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
  const int aarch64_code_length = VtableStub::pd_code_size_limit(true);
  VtableStub* s = new(aarch64_code_length) VtableStub(true, vtable_index);
  ResourceMark rm;
  CodeBuffer cb(s->entry_point(), aarch64_code_length);
  MacroAssembler* masm = new MacroAssembler(&cb);

#ifndef PRODUCT
  if (CountCompiledCalls) {
    __ lea(r19, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
    __ incrementw(Address(r19));
  }
#endif

  // get receiver (need to skip return address on top of stack)
  assert(VtableStub::receiver_location() == j_rarg0->as_VMReg(), "receiver expected in j_rarg0");

  // get receiver klass
  address npe_addr = __ pc();
  __ load_klass(r19, j_rarg0);

#ifndef PRODUCT
  if (DebugVtables) {
    Label L;
    // check offset vs vtable length
    __ ldrw(rscratch1, Address(r19, Klass::vtable_length_offset()));
    __ cmpw(rscratch1, vtable_index * vtableEntry::size());
    __ br(Assembler::GT, L);
    __ enter();
    __ mov(r2, vtable_index);
    __ call_VM(noreg,
               CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), j_rarg0, r2);
    __ leave();
    __ bind(L);
  }
#endif // PRODUCT

  __ lookup_virtual_method(r19, vtable_index, rmethod);

  if (DebugVtables) {
    Label L;
    __ cbz(rmethod, L);
    __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
    __ cbnz(rscratch1, L);
    __ stop("Vtable entry is NULL");
    __ bind(L);
  }
  // r0: receiver klass
  // rmethod: Method*
  // r2: receiver
  address ame_addr = __ pc();
  __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
  __ br(rscratch1);

  __ flush();

  if (PrintMiscellaneous && (WizardMode || Verbose)) {
    tty->print_cr("vtable #%d at " PTR_FORMAT "[%d] left over: %d",
                  vtable_index, p2i(s->entry_point()),
                  (int)(s->code_end() - s->entry_point()),
                  (int)(s->code_end() - __ pc()));
  }
  guarantee(__ pc() <= s->code_end(), "overflowed buffer");

  s->set_exception_points(npe_addr, ame_addr);
  return s;
}
VtableStub* VtableStubs::create_itable_stub(int itable_index) {
  // Note well: pd_code_size_limit is the absolute minimum we can get
  // away with.  If you add code here, bump the code stub size
  // returned by pd_code_size_limit!
  const int code_length = VtableStub::pd_code_size_limit(false);
  VtableStub* s = new(code_length) VtableStub(false, itable_index);
  ResourceMark rm;
  CodeBuffer cb(s->entry_point(), code_length);
  MacroAssembler* masm = new MacroAssembler(&cb);

#ifndef PRODUCT
  if (CountCompiledCalls) {
    __ lea(r10, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
    __ incrementw(Address(r10));
  }
#endif

  // Entry arguments:
  //  rscratch2: Interface
  //  j_rarg0: Receiver

  // Free registers (non-args) are r0 (interface), rmethod

  // get receiver (need to skip return address on top of stack)

  assert(VtableStub::receiver_location() == j_rarg0->as_VMReg(), "receiver expected in j_rarg0");
  // get receiver klass (also an implicit null-check)
  address npe_addr = __ pc();

  // Most registers are in use; we'll use r0, rmethod, r10, r11
  __ load_klass(r10, j_rarg0);

  Label throw_icce;

  // Get Method* and entrypoint for compiler
  __ lookup_interface_method(// inputs: rec. class, interface, itable index
                             r10, rscratch2, itable_index,
                             // outputs: method, scan temp. reg
                             rmethod, r11,
                             throw_icce);

  // method (rmethod): Method*
  // j_rarg0: receiver

#ifdef ASSERT
  if (DebugVtables) {
    Label L2;
    __ cbz(rmethod, L2);
    __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
    __ cbnz(rscratch1, L2);
    __ stop("compiler entrypoint is null");
    __ bind(L2);
  }
#endif // ASSERT

  // rmethod: Method*
  // j_rarg0: receiver
  address ame_addr = __ pc();
  __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
  __ br(rscratch1);

  __ bind(throw_icce);
  __ far_jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));

  __ flush();

  if (PrintMiscellaneous && (WizardMode || Verbose)) {
    tty->print_cr("itable #%d at " PTR_FORMAT "[%d] left over: %d",
                  itable_index, p2i(s->entry_point()),
                  (int)(s->code_end() - s->entry_point()),
                  (int)(s->code_end() - __ pc()));
  }
  guarantee(__ pc() <= s->code_end(), "overflowed buffer");

  s->set_exception_points(npe_addr, ame_addr);
  return s;
}
address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
  const char *name;
  switch (type) {
    case T_BOOLEAN: name = "jni_fast_GetBooleanField"; break;
    case T_BYTE:    name = "jni_fast_GetByteField";    break;
    case T_CHAR:    name = "jni_fast_GetCharField";    break;
    case T_SHORT:   name = "jni_fast_GetShortField";   break;
    case T_INT:     name = "jni_fast_GetIntField";     break;
    default:        ShouldNotReachHere();
  }
  ResourceMark rm;
  BufferBlob* b = BufferBlob::create(name, BUFFER_SIZE*wordSize);
  address fast_entry = b->instructions_begin();
  CodeBuffer cbuf(fast_entry, b->instructions_size());
  MacroAssembler* masm = new MacroAssembler(&cbuf);

  Label slow;

  // stack layout:    offset from rsp (in words):
  //  return pc        0
  //  jni env          1
  //  obj              2
  //  jfieldID         3

  ExternalAddress counter(SafepointSynchronize::safepoint_counter_addr());
  __ mov32 (rcx, counter);
  __ testb (rcx, 1);
  __ jcc (Assembler::notZero, slow);
  if (os::is_MP()) {
    __ mov(rax, rcx);
    __ andptr(rax, 1);                         // rax, must end up 0
    __ movptr(rdx, Address(rsp, rax, Address::times_1, 2*wordSize));
                                              // obj, notice rax, is 0.
                                              // rdx is data dependent on rcx.
  } else {
    __ movptr (rdx, Address(rsp, 2*wordSize));  // obj
  }
  __ movptr(rax, Address(rsp, 3*wordSize));  // jfieldID
  __ movptr(rdx, Address(rdx, 0));           // *obj
  __ shrptr (rax, 2);                         // offset

  assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
  speculative_load_pclist[count] = __ pc();
  switch (type) {
    case T_BOOLEAN: __ movzbl (rax, Address(rdx, rax, Address::times_1)); break;
    case T_BYTE:    __ movsbl (rax, Address(rdx, rax, Address::times_1)); break;
    case T_CHAR:    __ movzwl (rax, Address(rdx, rax, Address::times_1)); break;
    case T_SHORT:   __ movswl (rax, Address(rdx, rax, Address::times_1)); break;
    case T_INT:     __ movl   (rax, Address(rdx, rax, Address::times_1)); break;
    default:        ShouldNotReachHere();
  }

  Address ca1;
  if (os::is_MP()) {
    __ lea(rdx, counter);
    __ xorptr(rdx, rax);
    __ xorptr(rdx, rax);
    __ cmp32(rcx, Address(rdx, 0));
    // ca1 is the same as ca because
    // rax, ^ counter_addr ^ rax, = address
    // ca1 is data dependent on rax,.
  } else {
    __ cmp32(rcx, counter);
  }
  __ jcc (Assembler::notEqual, slow);

#ifndef _WINDOWS
  __ ret (0);
#else
  // __stdcall calling convention
  __ ret (3*wordSize);
#endif

  slowcase_entry_pclist[count++] = __ pc();
  __ bind (slow);
  address slow_case_addr;
  switch (type) {
    case T_BOOLEAN: slow_case_addr = jni_GetBooleanField_addr(); break;
    case T_BYTE:    slow_case_addr = jni_GetByteField_addr();    break;
    case T_CHAR:    slow_case_addr = jni_GetCharField_addr();    break;
    case T_SHORT:   slow_case_addr = jni_GetShortField_addr();   break;
    case T_INT:     slow_case_addr = jni_GetIntField_addr();
  }
  // tail call
  __ jump (ExternalAddress(slow_case_addr));

  __ flush ();

#ifndef _WINDOWS
  return fast_entry;
#else
  switch (type) {
    case T_BOOLEAN: jni_fast_GetBooleanField_fp = (GetBooleanField_t)fast_entry; break;
    case T_BYTE:    jni_fast_GetByteField_fp = (GetByteField_t)fast_entry; break;
    case T_CHAR:    jni_fast_GetCharField_fp = (GetCharField_t)fast_entry; break;
    case T_SHORT:   jni_fast_GetShortField_fp = (GetShortField_t)fast_entry; break;
    case T_INT:     jni_fast_GetIntField_fp = (GetIntField_t)fast_entry;
  }
  return os::win32::fast_jni_accessor_wrapper(type);
#endif
}
Beispiel #27
0
static char* gen_invoke_common_managed_func(char* stub) {
    // Defines stack alignment on managed function enter.
    const I_32 STACK_ALIGNMENT = MANAGED_STACK_ALIGNMENT;
    const I_32 STACK_ALIGNMENT_MASK = ~(STACK_ALIGNMENT - 1);
    const char * LOOP_BEGIN = "loop_begin";
    const char * LOOP_END = "loop_end";

    // [ebp + 8] - args
    // [ebp + 12] - size
    // [ebp + 16] - func
    const I_32 STACK_ARGS_OFFSET = 8;
    const I_32 STACK_NARGS_OFFSET = 12;
    const I_32 STACK_FUNC_OFFSET = 16;
    const I_32 STACK_CALLEE_SAVED_OFFSET = -12;

    tl::MemoryPool pool;
    LilCguLabelAddresses labels(&pool, stub);

    // Initialize ebp-based stack frame.
    stub = push(stub, ebp_opnd);
    stub = mov(stub, ebp_opnd, esp_opnd);

    // Preserve callee-saved registers.
    stub = push(stub, ebx_opnd);
    stub = push(stub, esi_opnd);
    stub = push(stub, edi_opnd);

    // Load an array of arguments ('args') and its size from the stack.
    stub = mov(stub, eax_opnd, M_Base_Opnd(ebp_reg, STACK_ARGS_OFFSET));
    stub = mov(stub, ecx_opnd, M_Base_Opnd(ebp_reg, STACK_NARGS_OFFSET));


    // Align memory stack.
    stub = lea(stub, ebx_opnd, M_Index_Opnd(n_reg, ecx_reg, 4, 4));
    stub = mov(stub, esi_opnd, ebx_opnd);
    stub = neg(stub, esi_opnd);
    stub = alu(stub, add_opc, esi_opnd, esp_opnd);
    stub = alu(stub, and_opc, esi_opnd, Imm_Opnd(size_32, STACK_ALIGNMENT_MASK));
    stub = alu(stub, add_opc, ebx_opnd, esi_opnd);
    stub = mov(stub, esp_opnd, ebx_opnd);

    // Load a pointer to the last argument of 'args' array.
    stub = lea(stub, eax_opnd, M_Index_Opnd(eax_reg, ecx_reg, -4, 4));
    stub = alu(stub, sub_opc, eax_opnd, esp_opnd);
    stub = alu(stub, or_opc, ecx_opnd, ecx_opnd);
    stub = branch8(stub, Condition_Z, Imm_Opnd(size_8, 0));
    labels.add_patch_to_label(LOOP_END, stub - 1, LPT_Rel8);

// LOOP_BEGIN:
    // Push inputs on the stack.
    labels.define_label(LOOP_BEGIN, stub, false);

    stub = push(stub, M_Index_Opnd(esp_reg, eax_reg, 0, 1));
    stub = loop(stub, Imm_Opnd(size_8, 0));
    labels.add_patch_to_label(LOOP_BEGIN, stub - 1, LPT_Rel8);

// LOOP_END:
    labels.define_label(LOOP_END, stub, false);

    // Call target function.
    stub = mov(stub, eax_opnd, M_Base_Opnd(ebp_reg, STACK_FUNC_OFFSET));
    stub = call(stub, eax_opnd);

    // Restore callee-saved registers from the stack.
    stub = lea(stub, esp_opnd, M_Base_Opnd(ebp_reg, STACK_CALLEE_SAVED_OFFSET));
    stub = pop(stub, edi_opnd);
    stub = pop(stub, esi_opnd);
    stub = pop(stub, ebx_opnd);

    // Leave current frame.
    stub = pop(stub, ebp_opnd);

    return stub;
}
address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
  const char *name;
  switch (type) {
    case T_BOOLEAN: name = "jni_fast_GetBooleanField"; break;
    case T_BYTE:    name = "jni_fast_GetByteField";    break;
    case T_CHAR:    name = "jni_fast_GetCharField";    break;
    case T_SHORT:   name = "jni_fast_GetShortField";   break;
    case T_INT:     name = "jni_fast_GetIntField";     break;
    case T_LONG:    name = "jni_fast_GetLongField";    break;
    case T_FLOAT:   name = "jni_fast_GetFloatField";   break;
    case T_DOUBLE:  name = "jni_fast_GetDoubleField";  break;
    default:        ShouldNotReachHere();
  }
  ResourceMark rm;
  BufferBlob* blob = BufferBlob::create(name, BUFFER_SIZE);
  CodeBuffer cbuf(blob);
  MacroAssembler* masm = new MacroAssembler(&cbuf);
  address fast_entry = __ pc();

  Label slow;

  unsigned long offset;
  __ adrp(rcounter_addr,
	  SafepointSynchronize::safepoint_counter_addr(), offset);
  Address safepoint_counter_addr(rcounter_addr, offset);
  __ ldrw(rcounter, safepoint_counter_addr);
  __ andw(rscratch1, rcounter, 1);
  __ cbnzw(rscratch1, slow);
  __ eor(robj, c_rarg1, rcounter);
  __ eor(robj, robj, rcounter);               // obj, since
                                              // robj ^ rcounter ^ rcounter == robj
                                              // robj is address dependent on rcounter.
  __ ldr(robj, Address(robj, 0));             // *obj
  __ lsr(roffset, c_rarg2, 2);                // offset

  assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
  speculative_load_pclist[count] = __ pc();   // Used by the segfault handler
  switch (type) {
    case T_BOOLEAN: __ ldrb    (result, Address(robj, roffset)); break;
    case T_BYTE:    __ ldrsb   (result, Address(robj, roffset)); break;
    case T_CHAR:    __ ldrh    (result, Address(robj, roffset)); break;
    case T_SHORT:   __ ldrsh   (result, Address(robj, roffset)); break;
    case T_FLOAT:   __ ldrw    (result, Address(robj, roffset)); break;
    case T_INT:     __ ldrsw   (result, Address(robj, roffset)); break;
    case T_DOUBLE:
    case T_LONG:    __ ldr     (result, Address(robj, roffset)); break;
    default:        ShouldNotReachHere();
  }

  // counter_addr is address dependent on result.
  __ eor(rcounter_addr, rcounter_addr, result);
  __ eor(rcounter_addr, rcounter_addr, result);
  __ ldrw(rscratch1, safepoint_counter_addr);
  __ cmpw(rcounter, rscratch1);
  __ br (Assembler::NE, slow);

  switch (type) {
    case T_FLOAT:   __ fmovs(v0, result); break;
    case T_DOUBLE:  __ fmovd(v0, result); break;
    default:        __ mov(r0, result);   break;
  }
  __ ret(lr);

  slowcase_entry_pclist[count++] = __ pc();
  __ bind(slow);
  address slow_case_addr;
  switch (type) {
    case T_BOOLEAN: slow_case_addr = jni_GetBooleanField_addr(); break;
    case T_BYTE:    slow_case_addr = jni_GetByteField_addr();    break;
    case T_CHAR:    slow_case_addr = jni_GetCharField_addr();    break;
    case T_SHORT:   slow_case_addr = jni_GetShortField_addr();   break;
    case T_INT:     slow_case_addr = jni_GetIntField_addr();     break;
    case T_LONG:    slow_case_addr = jni_GetLongField_addr();    break;
    case T_FLOAT:   slow_case_addr = jni_GetFloatField_addr();   break;
    case T_DOUBLE:  slow_case_addr = jni_GetDoubleField_addr();  break;
    default:        ShouldNotReachHere();
  }

  {
    __ enter();
    __ lea(rscratch1, ExternalAddress(slow_case_addr));
    __ blr(rscratch1);
    __ maybe_isb();
    __ leave();
    __ ret(lr);
  }
  __ flush ();

  return fast_entry;
}
void GPUDrawScanlineCodeGenerator::Init()
{
	mov(eax, dword[esp + _top]);

	// uint16* fb = (uint16*)m_global.vm + (top << (10 + sel.scalex)) + left;

	mov(edi, eax);
	shl(edi, 10 + m_sel.scalex);
	add(edi, edx);
	lea(edi, ptr[edi * 2 + (size_t)m_local.gd->vm]);

	// int steps = pixels - 8;

	sub(ecx, 8);

	if(m_sel.dtd)
	{
		// dither = GSVector4i::load<false>(&m_dither[top & 3][left & 3]);

		and(eax, 3);
		shl(eax, 5);
		and(edx, 3);
		shl(edx, 1);
		movdqu(xmm0, ptr[eax + edx + (size_t)m_dither]);
		movdqa(ptr[&m_local.temp.dither], xmm0);
	}

	mov(edx, dword[esp + _v]);

	if(m_sel.tme)
	{
		mov(esi, dword[&m_local.gd->tex]);

		// GSVector4i vt = GSVector4i(v.t).xxzzl();

		cvttps2dq(xmm4, ptr[edx + offsetof(GSVertexSW, t)]);
		pshuflw(xmm4, xmm4, _MM_SHUFFLE(2, 2, 0, 0));

		// s = vt.xxxx().add16(m_local.d.s);
		// t = vt.yyyy().add16(m_local.d.t);

		pshufd(xmm2, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm3, xmm4, _MM_SHUFFLE(1, 1, 1, 1));

		paddw(xmm2, ptr[&m_local.d.s]);

		if(!m_sel.sprite)
		{
			paddw(xmm3, ptr[&m_local.d.t]);
		}
		else
		{
			if(m_sel.ltf)
			{
				movdqa(xmm0, xmm3);
				psllw(xmm0, 8);
				psrlw(xmm0, 1);
				movdqa(ptr[&m_local.temp.vf], xmm0);
			}
		}

		movdqa(ptr[&m_local.temp.s], xmm2);
		movdqa(ptr[&m_local.temp.t], xmm3);
	}

	if(m_sel.tfx != 3) // != decal
	{
		// GSVector4i vc = GSVector4i(v.c).xxzzlh();

		cvttps2dq(xmm6, ptr[edx + offsetof(GSVertexSW, c)]);
		pshuflw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
		pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));

		// r = vc.xxxx();
		// g = vc.yyyy();
		// b = vc.zzzz();

		pshufd(xmm4, xmm6, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm5, xmm6, _MM_SHUFFLE(1, 1, 1, 1));
		pshufd(xmm6, xmm6, _MM_SHUFFLE(2, 2, 2, 2));

		if(m_sel.iip)
		{
			// r = r.add16(m_local.d.r);
			// g = g.add16(m_local.d.g);
			// b = b.add16(m_local.d.b);

			paddw(xmm4, ptr[&m_local.d.r]);
			paddw(xmm5, ptr[&m_local.d.g]);
			paddw(xmm6, ptr[&m_local.d.b]);
		}

		movdqa(ptr[&m_local.temp.r], xmm4);
		movdqa(ptr[&m_local.temp.g], xmm5);
		movdqa(ptr[&m_local.temp.b], xmm6);
	}
}
void CompactingPermGenGen::generate_vtable_methods(void** vtbl_list,
                                                   void** vtable,
                                                   char** md_top,
                                                   char* md_end,
                                                   char** mc_top,
                                                   char* mc_end) {

  intptr_t vtable_bytes = (num_virtuals * vtbl_list_size) * sizeof(void*);
  *(intptr_t *)(*md_top) = vtable_bytes;
  *md_top += sizeof(intptr_t);
  void** dummy_vtable = (void**)*md_top;
  *vtable = dummy_vtable;
  *md_top += vtable_bytes;

  // Get ready to generate dummy methods.

  CodeBuffer cb((unsigned char*)*mc_top, mc_end - *mc_top);
  MacroAssembler* masm = new MacroAssembler(&cb);

  Label common_code;
  for (int i = 0; i < vtbl_list_size; ++i) {
    for (int j = 0; j < num_virtuals; ++j) {
      dummy_vtable[num_virtuals * i + j] = (void*)masm->pc();

      // Load eax with a value indicating vtable/offset pair.
      // -- bits[ 7..0]  (8 bits) which virtual method in table?
      // -- bits[12..8]  (5 bits) which virtual method table?
      // -- must fit in 13-bit instruction immediate field.
      __ movl(rax, (i << 8) + j);
      __ jmp(common_code);
    }
  }

  __ bind(common_code);

  // Expecting to be called with "thiscall" convections -- the arguments
  // are on the stack and the "this" pointer is in c_rarg0. In addition, rax
  // was set (above) to the offset of the method in the table.

  __ push(c_rarg1);                     // save & free register
  __ push(c_rarg0);                     // save "this"
  __ mov(c_rarg0, rax);
  __ shrptr(c_rarg0, 8);                // isolate vtable identifier.
  __ shlptr(c_rarg0, LogBytesPerWord);
  __ lea(c_rarg1, ExternalAddress((address)vtbl_list)); // ptr to correct vtable list.
  __ addptr(c_rarg1, c_rarg0);          // ptr to list entry.
  __ movptr(c_rarg1, Address(c_rarg1, 0));      // get correct vtable address.
  __ pop(c_rarg0);                      // restore "this"
  __ movptr(Address(c_rarg0, 0), c_rarg1);      // update vtable pointer.

  __ andptr(rax, 0x00ff);                       // isolate vtable method index
  __ shlptr(rax, LogBytesPerWord);
  __ addptr(rax, c_rarg1);              // address of real method pointer.
  __ pop(c_rarg1);                      // restore register.
  __ movptr(rax, Address(rax, 0));      // get real method pointer.
  __ jmp(rax);                          // jump to the real method.

  __ flush();

  *mc_top = (char*)__ pc();
}