Example #1
0
	Encoding *RegisterAllocator::spill128(int i)
	{
		// Register loaded but not used, eliminate load and don't spill
		if(XMM[i].loadInstruction && loadElimination)
		{
			XMM[i].loadInstruction->reserve();
			XMM[i].loadInstruction = 0;
	
			XMM[i].reference = 0;
			XMM[i].priority = 0;
			XMM[i].partial = 0;
			XMM[i].copyInstruction = 0;
			XMM[i].loadInstruction = 0;
		//	XMM[i].spillInstruction = 0;   // NOTE: Keep previous spill info
	
			return 0;
		}

		Encoding *spillInstruction = 0;

		if(XMM[i].reference != 0 && (XMM[i].modified || !dropUnmodified))
		{
			if(XMM[i].partial) spillInstruction = movss(dword_ptr [XMM[i].reference], OperandXMMREG(i));
			else               spillInstruction = movaps(xword_ptr [XMM[i].reference], OperandXMMREG(i));
		}
		
		XMM[i].free();

		return spillInstruction;
	}
void InterpreterRuntime::SignatureHandlerGenerator::pass_float()
{
  const Address src(from(), -offset() * wordSize);

#ifdef _WIN64
  if (_num_args < Argument::n_float_register_parameters-1) {
    __ movss(as_FloatRegister(++_num_args), src);
  } else {
    __ movl(rax, src);
    __ movl(Address(to(), _stack_offset), rax);
    _stack_offset += wordSize;
  }
#else
  if (_num_fp_args < Argument::n_float_register_parameters) {
    __ movss(as_FloatRegister(_num_fp_args++), src);
  } else {
    __ movl(rax, src);
    __ movl(Address(to(), _stack_offset), rax);
    _stack_offset += wordSize;
  }
#endif
}
Example #3
0
	OperandXMMREG RegisterAllocator::allocate128(int i, const OperandREF &ref, bool copy, bool ss)
	{
		XMM[i].reference = ref;
		XMM[i].partial = ss ? 4 : 0;

		prioritize128(i);

		Encoding *loadInstruction = 0;
		Encoding *spillInstruction = XMM[i].spillInstruction;
		AllocationData spillAllocation = XMM[i].spill;

		if(copy)
		{
			if(ss) loadInstruction = movss(OperandXMMREG(i), dword_ptr [ref]);
			else   loadInstruction = movaps(OperandXMMREG(i), xword_ptr [ref]);
		}

		XMM[i].loadInstruction = loadInstruction;
		XMM[i].spillInstruction = spillInstruction;
		XMM[i].spill = spillAllocation;
		XMM[i].modified = false;

		return OperandXMMREG(i);
	}
void GSSetupPrimCodeGenerator::Depth()
{
	if(!m_en.z && !m_en.f)
	{
		return;
	}

	if(!m_env.sel.sprite)
	{
		// GSVector4 t = dscan.p;

		movaps(xmm0, xmmword[edx + 16]);

		if(m_en.f)
		{
			// GSVector4 df = p.wwww();

			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

			// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();

			movaps(xmm2, xmm1);
			mulps(xmm2, xmm3);
			cvttps2dq(xmm2, xmm2);
			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			movdqa(xmmword[&m_env.d4.f], xmm2);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

				movaps(xmm2, xmm1);
				mulps(xmm2, Xmm(4 + i));
				cvttps2dq(xmm2, xmm2);
				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				movdqa(xmmword[&m_env.d[i].f], xmm2);
			}
		}

		if(m_en.z)
		{
			// GSVector4 dz = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			// m_env.d4.z = dz * 4.0f;

			movaps(xmm1, xmm0);
			mulps(xmm1, xmm3);
			movdqa(xmmword[&m_env.d4.z], xmm1);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].z = dz * m_shift[i];

				movaps(xmm1, xmm0);
				mulps(xmm1, Xmm(4 + i));
				movdqa(xmmword[&m_env.d[i].z], xmm1);
			}
		}
	}
	else
	{
		// GSVector4 p = vertices[0].p;

		movaps(xmm0, xmmword[ecx + 16]);

		if(m_en.f)
		{
			// m_env.p.f = GSVector4i(p).zzzzh().zzzz();

			movaps(xmm1, xmm0);
			cvttps2dq(xmm1, xmm1);
			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			movdqa(xmmword[&m_env.p.f], xmm1);
		}

		if(m_en.z)
		{
			// GSVector4 z = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			if(m_env.sel.zoverflow)
			{
				// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

				static const float half = 0.5f;

				movss(xmm1, dword[&half]);
				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
				mulps(xmm1, xmm0);
				cvttps2dq(xmm1, xmm1);
				pslld(xmm1, 1);

				cvttps2dq(xmm0, xmm0);
				pcmpeqd(xmm2, xmm2);
				psrld(xmm2, 31);
				pand(xmm0, xmm2);
				
				por(xmm0, xmm1);
			}
			else
			{
				// m_env.p.z = GSVector4i(z);

				cvttps2dq(xmm0, xmm0);
			}

			movdqa(xmmword[&m_env.p.z], xmm0);
		}
	}
}
address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) {
    const char *name;
    switch (type) {
    case T_FLOAT:
        name = "jni_fast_GetFloatField";
        break;
    case T_DOUBLE:
        name = "jni_fast_GetDoubleField";
        break;
    default:
        ShouldNotReachHere();
    }
    ResourceMark rm;
    BufferBlob* b = BufferBlob::create(name, BUFFER_SIZE);
    address fast_entry = b->instructions_begin();
    CodeBuffer* cbuf = new CodeBuffer(fast_entry, b->instructions_size());
    MacroAssembler* masm = new MacroAssembler(cbuf);

    Label slow;

    address counter_addr = SafepointSynchronize::safepoint_counter_addr();
    Address ca(counter_addr, relocInfo::none);
    __ movl (rcounter, ca);
    __ movq (robj, rarg1);
    __ testb (rcounter, 1);
    __ jcc (Assembler::notZero, slow);
    if (os::is_MP()) {
        __ xorq (robj, rcounter);
        __ xorq (robj, rcounter);                   // obj, since
        // robj ^ rcounter ^ rcounter == robj
        // robj is data dependent on rcounter.
    }
    __ movq (robj, Address(robj));                // *obj
    __ movq (roffset, rarg2);
    __ shrq (roffset, 2);                         // offset

    assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
    speculative_load_pclist[count] = __ pc();
    switch (type) {
    case T_FLOAT:
        __ movss  (xmm0, Address(robj, roffset, Address::times_1));
        break;
    case T_DOUBLE:
        __ movlpd (xmm0, Address(robj, roffset, Address::times_1));
        break;
    default:
        ShouldNotReachHere();
    }

    __ movq (rcounter_addr, (int64_t)counter_addr);
    ca = Address(rcounter_addr);
    if (os::is_MP()) {
        __ movdq (rax, xmm0);
        __ xorq (rcounter_addr, rax);
        __ xorq (rcounter_addr, rax);               // ca is data dependent on xmm0.
    }
    __ cmpl (rcounter, ca);
    __ jcc (Assembler::notEqual, slow);

    __ ret (0);

    slowcase_entry_pclist[count++] = __ pc();
    __ bind (slow);
    address slow_case_addr;
    switch (type) {
    case T_FLOAT:
        slow_case_addr = jni_GetFloatField_addr();
        break;
    case T_DOUBLE:
        slow_case_addr = jni_GetDoubleField_addr();
    }
    // tail call
    __ jmp (slow_case_addr, relocInfo::none);

    __ flush ();

    return fast_entry;
}
  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Since we need to preserve callee-saved values
  // (currently only for C2, but done for C1 as well) we need a callee-saved oop map and
  // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs.
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller saved registers were assumed volatile in the compiler.
  //
  // Note: the routine set_pc_not_at_call_for_caller in SharedRuntime.cpp requires
  // that this code be generated into a RuntimeStub.
  address StubGenerator::generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc) {

    int insts_size = 256;
    int locs_size  = 32;

    CodeBuffer* code     = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false, NULL, NULL, NULL, false, NULL, name, false);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM
    Register java_thread = ebx;
    __ get_thread(java_thread);
    if (restore_saved_exception_pc) {
      __ movl(eax, Address(java_thread, in_bytes(JavaThread::saved_exception_pc_offset())));
      __ pushl(eax);
    }
      
#ifndef COMPILER2
    __ enter(); // required for proper stackwalking of RuntimeStub frame
#endif COMPILER2

    __ subl(esp, framesize * wordSize); // prolog

#ifdef COMPILER2
    if( OptoRuntimeCalleeSavedFloats ) {
      if( UseSSE == 1 ) {
        __ movss(Address(esp,xmm6_off*wordSize),xmm6);
        __ movss(Address(esp,xmm7_off*wordSize),xmm7);
      } else if( UseSSE == 2 ) {
        __ movsd(Address(esp,xmm6_off*wordSize),xmm6);
        __ movsd(Address(esp,xmm7_off*wordSize),xmm7);
      }
    }
#endif /* COMPILER2 */
    __ movl(Address(esp, ebp_off * wordSize), ebp);
    __ movl(Address(esp, edi_off * wordSize), edi);
    __ movl(Address(esp, esi_off * wordSize), esi);

    // push java thread (becomes first argument of C function)
    __ movl(Address(esp, thread_off * wordSize), java_thread);

    // Set up last_Java_sp and last_Java_fp
    __ set_last_Java_frame(java_thread, esp, ebp, NULL);

    // Call runtime
    __ call(runtime_entry, relocInfo::runtime_call_type);
    // Generate oop map
    OopMap* map =  new OopMap(framesize, 0);        
#ifdef COMPILER2
    // SharedInfo is apparently not initialized if -Xint is specified
    if (UseCompiler) {
      map->set_callee_saved(SharedInfo::stack2reg(ebp_off), framesize, 0, OptoReg::Name(EBP_num));
      map->set_callee_saved(SharedInfo::stack2reg(edi_off), framesize, 0, OptoReg::Name(EDI_num));
      map->set_callee_saved(SharedInfo::stack2reg(esi_off), framesize, 0, OptoReg::Name(ESI_num));
      if( OptoRuntimeCalleeSavedFloats ) {
        map->set_callee_saved(SharedInfo::stack2reg(xmm6_off  ), framesize, 0, OptoReg::Name(XMM6a_num));
        map->set_callee_saved(SharedInfo::stack2reg(xmm6_off+1), framesize, 0, OptoReg::Name(XMM6b_num));
        map->set_callee_saved(SharedInfo::stack2reg(xmm7_off  ), framesize, 0, OptoReg::Name(XMM7a_num));
        map->set_callee_saved(SharedInfo::stack2reg(xmm7_off+1), framesize, 0, OptoReg::Name(XMM7b_num));
      }
    }
#endif
#ifdef COMPILER1
    map->set_callee_saved(OptoReg::Name(SharedInfo::stack0+ebp_off), framesize, 0, OptoReg::Name(ebp->encoding()));
    map->set_callee_saved(OptoReg::Name(SharedInfo::stack0+esi_off), framesize, 0, OptoReg::Name(esi->encoding()));
    map->set_callee_saved(OptoReg::Name(SharedInfo::stack0+edi_off), framesize, 0, OptoReg::Name(edi->encoding()));
#endif
    oop_maps->add_gc_map(__ pc() - start, true, map);
      
    // restore the thread (cannot use the pushed argument since arguments
    // may be overwritten by C code generated by an optimizing compiler);
    // however can use the register value directly if it is callee saved.
    __ get_thread(java_thread);

    __ reset_last_Java_frame(java_thread, false);

    // Restore callee save registers.  This must be done after resetting the Java frame
#ifdef COMPILER2
    if( OptoRuntimeCalleeSavedFloats ) {
      if( UseSSE == 1 ) {
        __ movss(xmm6,Address(esp,xmm6_off*wordSize));
        __ movss(xmm7,Address(esp,xmm7_off*wordSize));
      } else if( UseSSE == 2 ) {
        __ movsd(xmm6,Address(esp,xmm6_off*wordSize));
        __ movsd(xmm7,Address(esp,xmm7_off*wordSize));
      }
    }
#endif /* COMPILER2 */
    __ movl(ebp,Address(esp, ebp_off * wordSize));
    __ movl(edi,Address(esp, edi_off * wordSize));
    __ movl(esi,Address(esp, esi_off * wordSize));

    // discard arguments
    __ addl(esp, framesize * wordSize); // epilog

#ifndef COMPILER2
    __ leave(); // required for proper stackwalking of RuntimeStub frame
#endif COMPILER2

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ cmpl(Address(java_thread, Thread::pending_exception_offset()), (int)NULL);
    __ jcc(Assembler::notEqual, L);
    __ should_not_reach_here();
    __ bind(L);
#endif ASSERT
    __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);

    // Note: it seems the frame size reported to the RuntimeStub has
    // to be incremented by 1 to account for the return PC. It
    // definitely must be one more than the amount by which SP was
    // decremented.
    int extra_words = 1;
#ifdef COMPILER1
    ++extra_words; // Not strictly necessary since C1 ignores frame size and uses link
#endif COMPILER1

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, code, framesize + extra_words, oop_maps, false);
    return stub->entry_point();
  }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
{
    if(!m_sel.zb)
    {
        return;
    }

    // int za = fza_base.y + fza_offset->y;

    mov(ebp, dword[esi + 4]);
    add(ebp, dword[edi + 4]);

    // GSVector4i zs = zi;

    if(!m_sel.sprite)
    {
        if(m_sel.zoverflow)
        {
            // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

            static float half = 0.5f;

            movss(temp1, dword[&half]);
            shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0));
            mulps(temp1, xmm0);
            cvttps2dq(temp1, temp1);
            pslld(temp1, 1);

            cvttps2dq(xmm0, xmm0);
            pcmpeqd(temp2, temp2);
            psrld(temp2, 31);
            pand(xmm0, temp2);

            por(xmm0, temp1);
        }
        else
        {
            // zs = GSVector4i(z);

            cvttps2dq(xmm0, xmm0);
        }

        if(m_sel.zwrite)
        {
            movdqa(xmmword[&m_env.temp.zs], xmm0);
        }
    }

    if(m_sel.ztest)
    {
        ReadPixel(xmm1, ebp);

        if(m_sel.zwrite && m_sel.zpsm < 2)
        {
            movdqa(xmmword[&m_env.temp.zd], xmm1);
        }

        // zd &= 0xffffffff >> m_sel.zpsm * 8;

        if(m_sel.zpsm)
        {
            pslld(xmm1, m_sel.zpsm * 8);
            psrld(xmm1, m_sel.zpsm * 8);
        }

        if(m_sel.zoverflow || m_sel.zpsm == 0)
        {
            // GSVector4i o = GSVector4i::x80000000();

            pcmpeqd(xmm4, xmm4);
            pslld(xmm4, 31);

            // GSVector4i zso = zs - o;

            psubd(xmm0, xmm4);

            // GSVector4i zdo = zd - o;

            psubd(xmm1, xmm4);
        }

        switch(m_sel.ztst)
        {
        case ZTST_GEQUAL:
            // test |= zso < zdo; // ~(zso >= zdo)
            pcmpgtd(xmm1, xmm0);
            por(xmm7, xmm1);
            break;

        case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
            // test |= zso <= zdo; // ~(zso > zdo)
            pcmpgtd(xmm0, xmm1);
            pcmpeqd(xmm4, xmm4);
            pxor(xmm0, xmm4);
            por(xmm7, xmm0);
            break;
        }

        alltrue();
    }
}
Example #8
0
float TfirFilter::vec_inner_prod_sse(const float *eax, const float *edi, int ecx)
{
  __m128 xmm3,xmm4,xmm0,xmm1,xmm5,xmm6;
  xorps (xmm3, xmm3);
  xorps (xmm4, xmm4);

  ecx-=8;// sub $8, %%ecx
  if (ecx<0) goto //jb
   mul8_skip;

mul8_loop:
  movups (xmm0,eax);
  movups (xmm1,edi);
  movups (xmm5,16/sizeof(float)+eax);
  movups (xmm6,16/sizeof(float)+edi);
  eax+=32/sizeof(float);
  edi+=32/sizeof(float);
  mulps (xmm1,xmm0);
  mulps (xmm6,xmm5);
  addps (xmm3,xmm1);
  addps (xmm4,xmm6);

  ecx-=8;
  if (ecx>=0) //jae
   goto mul8_loop;

mul8_skip:

  addps (xmm3,xmm4);

  ecx+=4;
  if (ecx<0) //jl
   goto mul4_skip;

  movups (xmm0,eax);
  movups (xmm1,edi);
  eax+=16/sizeof(float);
  edi+=16/sizeof(float);
  mulps (xmm1, xmm0);
  addps (xmm3, xmm1);

  ecx-=4;

mul4_skip:

  ecx+=4;

  goto cond1;

mul1_loop:
  movss (xmm0,eax);
  movss (xmm1,edi);
  eax+=4/sizeof(float);
  edi+=4/sizeof(float);
  mulss (xmm1,xmm0);
  addss (xmm3,xmm1);

cond1:
  ecx-=1;
  if (ecx>=0) // jae
   goto mul1_loop;

  movhlps  (xmm4,xmm3);
  addps    (xmm3,xmm4);
  movaps   (xmm4,xmm3);
  //FIXME: which one?
  xmm4=_mm_shuffle_ps(xmm4,xmm4,0x55);// shufps $0x55, xmm4, xmm4
                                      // shufps $33, xmm4, xmm4
  addss    (xmm3, xmm4);
  float sum;
  movss    (&sum , xmm3);
  return sum;
}