// Releases XMM register i, writing its contents back to the referenced
// memory operand when that is required.
//
// Returns the encoding of the emitted store, or 0 when no store was emitted
// (either the pending load was eliminated, or there was nothing to write back).
Encoding *RegisterAllocator::spill128(int i)
{
    // The register was loaded but never used: withdraw the load itself
    // instead of emitting a spill.
    if(XMM[i].loadInstruction && loadElimination)
    {
        XMM[i].loadInstruction->reserve();

        XMM[i].reference = 0;
        XMM[i].priority = 0;
        XMM[i].partial = 0;
        XMM[i].copyInstruction = 0;
        XMM[i].loadInstruction = 0;
        // NOTE: spillInstruction is deliberately left untouched so the
        // previous spill information is kept.

        return 0;
    }

    // A store is only needed when the register is bound to a reference and
    // is either modified or unmodified registers are not being dropped.
    const bool needsStore = XMM[i].reference != 0 && (XMM[i].modified || !dropUnmodified);

    Encoding *store = 0;

    if(needsStore)
    {
        if(XMM[i].partial)
        {
            // Partial register: write back a single 32-bit scalar.
            store = movss(dword_ptr [XMM[i].reference], OperandXMMREG(i));
        }
        else
        {
            // Full register: write back all 128 bits.
            store = movaps(xword_ptr [XMM[i].reference], OperandXMMREG(i));
        }
    }

    XMM[i].free();

    return store;
}
void InterpreterRuntime::SignatureHandlerGenerator::pass_float() { const Address src(from(), -offset() * wordSize); #ifdef _WIN64 if (_num_args < Argument::n_float_register_parameters-1) { __ movss(as_FloatRegister(++_num_args), src); } else { __ movl(rax, src); __ movl(Address(to(), _stack_offset), rax); _stack_offset += wordSize; } #else if (_num_fp_args < Argument::n_float_register_parameters) { __ movss(as_FloatRegister(_num_fp_args++), src); } else { __ movl(rax, src); __ movl(Address(to(), _stack_offset), rax); _stack_offset += wordSize; } #endif }
// Binds XMM register i to the memory reference 'ref' and returns the register
// operand.
//
//   copy - when true, emit a load of 'ref' into the register.
//   ss   - when true, the register holds a single scalar (partial = 4) and the
//          load is a movss; otherwise the full 128 bits are loaded with movaps.
OperandXMMREG RegisterAllocator::allocate128(int i, const OperandREF &ref, bool copy, bool ss)
{
    XMM[i].reference = ref;
    XMM[i].partial = ss ? 4 : 0;

    prioritize128(i);

    // Snapshot the spill bookkeeping before emitting anything; it is written
    // back unchanged below.
    // NOTE(review): presumably emitting the load can disturb this slot's spill
    // state -- confirm against the emitter.
    Encoding *savedSpillInstruction = XMM[i].spillInstruction;
    AllocationData savedSpill = XMM[i].spill;

    Encoding *load = 0;

    if(copy && ss)
    {
        load = movss(OperandXMMREG(i), dword_ptr [ref]);
    }
    else if(copy)
    {
        load = movaps(OperandXMMREG(i), xword_ptr [ref]);
    }

    XMM[i].loadInstruction = load;
    XMM[i].spillInstruction = savedSpillInstruction;
    XMM[i].spill = savedSpill;
    XMM[i].modified = false;

    return OperandXMMREG(i);
}
void GSSetupPrimCodeGenerator::Depth() { if(!m_en.z && !m_en.f) { return; } if(!m_env.sel.sprite) { // GSVector4 t = dscan.p; movaps(xmm0, xmmword[edx + 16]); if(m_en.f) { // GSVector4 df = p.wwww(); movaps(xmm1, xmm0); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); // m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, xmm3); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d4.f], xmm2); for(int i = 0; i < 4; i++) { // m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); movaps(xmm2, xmm1); mulps(xmm2, Xmm(4 + i)); cvttps2dq(xmm2, xmm2); pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(xmmword[&m_env.d[i].f], xmm2); } } if(m_en.z) { // GSVector4 dz = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); // m_env.d4.z = dz * 4.0f; movaps(xmm1, xmm0); mulps(xmm1, xmm3); movdqa(xmmword[&m_env.d4.z], xmm1); for(int i = 0; i < 4; i++) { // m_env.d[i].z = dz * m_shift[i]; movaps(xmm1, xmm0); mulps(xmm1, Xmm(4 + i)); movdqa(xmmword[&m_env.d[i].z], xmm1); } } } else { // GSVector4 p = vertices[0].p; movaps(xmm0, xmmword[ecx + 16]); if(m_en.f) { // m_env.p.f = GSVector4i(p).zzzzh().zzzz(); movaps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); movdqa(xmmword[&m_env.p.f], xmm1); } if(m_en.z) { // GSVector4 z = p.zzzz(); shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); if(m_env.sel.zoverflow) { // m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); static const float half = 0.5f; movss(xmm1, dword[&half]); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); mulps(xmm1, xmm0); cvttps2dq(xmm1, xmm1); pslld(xmm1, 1); cvttps2dq(xmm0, xmm0); pcmpeqd(xmm2, xmm2); psrld(xmm2, 31); pand(xmm0, xmm2); por(xmm0, xmm1); } else { // m_env.p.z = GSVector4i(z); cvttps2dq(xmm0, xmm0); } movdqa(xmmword[&m_env.p.z], xmm0); } } }
// Generates the fast-path stub for JNI Get<Float|Double>Field and returns its
// entry address.
//
// The stub reads the safepoint counter, performs a speculative load of the
// field into xmm0, then re-reads the counter. If the counter was odd on entry
// or changed in between, control is transferred to the regular (slow) JNI
// accessor by a tail call.
//
//   type - T_FLOAT or T_DOUBLE; any other value hits ShouldNotReachHere().
//
// Side effects: records the speculative-load pc and the slow-case entry pc in
// speculative_load_pclist / slowcase_entry_pclist and increments 'count'.
address JNI_FastGetField::generate_fast_get_float_field0(BasicType type) {
  // Initialized so the variable is never read indeterminate, even if
  // ShouldNotReachHere() were to return.
  const char *name = 0;
  switch (type) {
    case T_FLOAT:  name = "jni_fast_GetFloatField";  break;
    case T_DOUBLE: name = "jni_fast_GetDoubleField"; break;
    default:       ShouldNotReachHere();
  }
  ResourceMark rm;
  BufferBlob* b = BufferBlob::create(name, BUFFER_SIZE);
  address fast_entry = b->instructions_begin();
  CodeBuffer* cbuf = new CodeBuffer(fast_entry, b->instructions_size());
  MacroAssembler* masm = new MacroAssembler(cbuf);

  Label slow;

  address counter_addr = SafepointSynchronize::safepoint_counter_addr();
  Address ca(counter_addr, relocInfo::none);
  __ movl (rcounter, ca);
  __ movq (robj, rarg1);
  __ testb (rcounter, 1);                       // odd counter => take the slow path
  __ jcc (Assembler::notZero, slow);
  if (os::is_MP()) {
    __ xorq (robj, rcounter);
    __ xorq (robj, rcounter);                   // obj, since
                                                // robj ^ rcounter ^ rcounter == robj
                                                // robj is data dependent on rcounter.
  }
  __ movq (robj, Address(robj));                // *obj
  __ movq (roffset, rarg2);
  __ shrq (roffset, 2);                         // offset

  assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
  // Remember where the speculative load is emitted.
  speculative_load_pclist[count] = __ pc();
  switch (type) {
    case T_FLOAT:  __ movss  (xmm0, Address(robj, roffset, Address::times_1)); break;
    case T_DOUBLE: __ movlpd (xmm0, Address(robj, roffset, Address::times_1)); break;
    default:       ShouldNotReachHere();
  }

  __ movq (rcounter_addr, (int64_t)counter_addr);
  ca = Address(rcounter_addr);
  if (os::is_MP()) {
    __ movdq (rax, xmm0);
    __ xorq (rcounter_addr, rax);
    __ xorq (rcounter_addr, rax);               // ca is data dependent on xmm0.
  }
  // The counter must be unchanged for the speculative load to be valid.
  __ cmpl (rcounter, ca);
  __ jcc (Assembler::notEqual, slow);

  __ ret (0);

  slowcase_entry_pclist[count++] = __ pc();
  __ bind (slow);
  // Initialized and given a default case so an unexpected type can never
  // produce a jump to an indeterminate address; this matches the two switches
  // above.
  address slow_case_addr = 0;
  switch (type) {
    case T_FLOAT:  slow_case_addr = jni_GetFloatField_addr();  break;
    case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break;
    default:       ShouldNotReachHere();
  }
  // tail call
  __ jmp (slow_case_addr, relocInfo::none);

  __ flush ();

  return fast_entry;
}
//------------------------------------------------------------------------------------------------------------------------ // Continuation point for throwing of implicit exceptions that are not handled in // the current activation. Fabricates an exception oop and initiates normal // exception dispatching in this frame. Since we need to preserve callee-saved values // (currently only for C2, but done for C1 as well) we need a callee-saved oop map and // therefore have to make these stubs into RuntimeStubs rather than BufferBlobs. // If the compiler needs all registers to be preserved between the fault // point and the exception handler then it must assume responsibility for that in // AbstractCompiler::continuation_for_implicit_null_exception or // continuation_for_implicit_division_by_zero_exception. All other implicit // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are // either at call sites or otherwise assume that stack unwinding will be initiated, // so caller saved registers were assumed volatile in the compiler. // // Note: the routine set_pc_not_at_call_for_caller in SharedRuntime.cpp requires // that this code be generated into a RuntimeStub. 
address StubGenerator::generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc) { int insts_size = 256; int locs_size = 32; CodeBuffer* code = new CodeBuffer(insts_size, locs_size, 0, 0, 0, false, NULL, NULL, NULL, false, NULL, name, false); OopMapSet* oop_maps = new OopMapSet(); MacroAssembler* masm = new MacroAssembler(code); address start = __ pc(); // This is an inlined and slightly modified version of call_VM // which has the ability to fetch the return PC out of // thread-local storage and also sets up last_Java_sp slightly // differently than the real call_VM Register java_thread = ebx; __ get_thread(java_thread); if (restore_saved_exception_pc) { __ movl(eax, Address(java_thread, in_bytes(JavaThread::saved_exception_pc_offset()))); __ pushl(eax); } #ifndef COMPILER2 __ enter(); // required for proper stackwalking of RuntimeStub frame #endif COMPILER2 __ subl(esp, framesize * wordSize); // prolog #ifdef COMPILER2 if( OptoRuntimeCalleeSavedFloats ) { if( UseSSE == 1 ) { __ movss(Address(esp,xmm6_off*wordSize),xmm6); __ movss(Address(esp,xmm7_off*wordSize),xmm7); } else if( UseSSE == 2 ) { __ movsd(Address(esp,xmm6_off*wordSize),xmm6); __ movsd(Address(esp,xmm7_off*wordSize),xmm7); } } #endif /* COMPILER2 */ __ movl(Address(esp, ebp_off * wordSize), ebp); __ movl(Address(esp, edi_off * wordSize), edi); __ movl(Address(esp, esi_off * wordSize), esi); // push java thread (becomes first argument of C function) __ movl(Address(esp, thread_off * wordSize), java_thread); // Set up last_Java_sp and last_Java_fp __ set_last_Java_frame(java_thread, esp, ebp, NULL); // Call runtime __ call(runtime_entry, relocInfo::runtime_call_type); // Generate oop map OopMap* map = new OopMap(framesize, 0); #ifdef COMPILER2 // SharedInfo is apparently not initialized if -Xint is specified if (UseCompiler) { map->set_callee_saved(SharedInfo::stack2reg(ebp_off), framesize, 0, OptoReg::Name(EBP_num)); 
map->set_callee_saved(SharedInfo::stack2reg(edi_off), framesize, 0, OptoReg::Name(EDI_num)); map->set_callee_saved(SharedInfo::stack2reg(esi_off), framesize, 0, OptoReg::Name(ESI_num)); if( OptoRuntimeCalleeSavedFloats ) { map->set_callee_saved(SharedInfo::stack2reg(xmm6_off ), framesize, 0, OptoReg::Name(XMM6a_num)); map->set_callee_saved(SharedInfo::stack2reg(xmm6_off+1), framesize, 0, OptoReg::Name(XMM6b_num)); map->set_callee_saved(SharedInfo::stack2reg(xmm7_off ), framesize, 0, OptoReg::Name(XMM7a_num)); map->set_callee_saved(SharedInfo::stack2reg(xmm7_off+1), framesize, 0, OptoReg::Name(XMM7b_num)); } } #endif #ifdef COMPILER1 map->set_callee_saved(OptoReg::Name(SharedInfo::stack0+ebp_off), framesize, 0, OptoReg::Name(ebp->encoding())); map->set_callee_saved(OptoReg::Name(SharedInfo::stack0+esi_off), framesize, 0, OptoReg::Name(esi->encoding())); map->set_callee_saved(OptoReg::Name(SharedInfo::stack0+edi_off), framesize, 0, OptoReg::Name(edi->encoding())); #endif oop_maps->add_gc_map(__ pc() - start, true, map); // restore the thread (cannot use the pushed argument since arguments // may be overwritten by C code generated by an optimizing compiler); // however can use the register value directly if it is callee saved. __ get_thread(java_thread); __ reset_last_Java_frame(java_thread, false); // Restore callee save registers. 
This must be done after resetting the Java frame #ifdef COMPILER2 if( OptoRuntimeCalleeSavedFloats ) { if( UseSSE == 1 ) { __ movss(xmm6,Address(esp,xmm6_off*wordSize)); __ movss(xmm7,Address(esp,xmm7_off*wordSize)); } else if( UseSSE == 2 ) { __ movsd(xmm6,Address(esp,xmm6_off*wordSize)); __ movsd(xmm7,Address(esp,xmm7_off*wordSize)); } } #endif /* COMPILER2 */ __ movl(ebp,Address(esp, ebp_off * wordSize)); __ movl(edi,Address(esp, edi_off * wordSize)); __ movl(esi,Address(esp, esi_off * wordSize)); // discard arguments __ addl(esp, framesize * wordSize); // epilog #ifndef COMPILER2 __ leave(); // required for proper stackwalking of RuntimeStub frame #endif COMPILER2 // check for pending exceptions #ifdef ASSERT Label L; __ cmpl(Address(java_thread, Thread::pending_exception_offset()), (int)NULL); __ jcc(Assembler::notEqual, L); __ should_not_reach_here(); __ bind(L); #endif ASSERT __ jmp(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type); // Note: it seems the frame size reported to the RuntimeStub has // to be incremented by 1 to account for the return PC. It // definitely must be one more than the amount by which SP was // decremented. int extra_words = 1; #ifdef COMPILER1 ++extra_words; // Not strictly necessary since C1 ignores frame size and uses link #endif COMPILER1 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, code, framesize + extra_words, oop_maps, false); return stub->entry_point(); }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) { if(!m_sel.zb) { return; } // int za = fza_base.y + fza_offset->y; mov(ebp, dword[esi + 4]); add(ebp, dword[edi + 4]); // GSVector4i zs = zi; if(!m_sel.sprite) { if(m_sel.zoverflow) { // zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); static float half = 0.5f; movss(temp1, dword[&half]); shufps(temp1, temp1, _MM_SHUFFLE(0, 0, 0, 0)); mulps(temp1, xmm0); cvttps2dq(temp1, temp1); pslld(temp1, 1); cvttps2dq(xmm0, xmm0); pcmpeqd(temp2, temp2); psrld(temp2, 31); pand(xmm0, temp2); por(xmm0, temp1); } else { // zs = GSVector4i(z); cvttps2dq(xmm0, xmm0); } if(m_sel.zwrite) { movdqa(xmmword[&m_env.temp.zs], xmm0); } } if(m_sel.ztest) { ReadPixel(xmm1, ebp); if(m_sel.zwrite && m_sel.zpsm < 2) { movdqa(xmmword[&m_env.temp.zd], xmm1); } // zd &= 0xffffffff >> m_sel.zpsm * 8; if(m_sel.zpsm) { pslld(xmm1, m_sel.zpsm * 8); psrld(xmm1, m_sel.zpsm * 8); } if(m_sel.zoverflow || m_sel.zpsm == 0) { // GSVector4i o = GSVector4i::x80000000(); pcmpeqd(xmm4, xmm4); pslld(xmm4, 31); // GSVector4i zso = zs - o; psubd(xmm0, xmm4); // GSVector4i zdo = zd - o; psubd(xmm1, xmm4); } switch(m_sel.ztst) { case ZTST_GEQUAL: // test |= zso < zdo; // ~(zso >= zdo) pcmpgtd(xmm1, xmm0); por(xmm7, xmm1); break; case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL // test |= zso <= zdo; // ~(zso > zdo) pcmpgtd(xmm0, xmm1); pcmpeqd(xmm4, xmm4); pxor(xmm0, xmm4); por(xmm7, xmm0); break; } alltrue(); } }
float TfirFilter::vec_inner_prod_sse(const float *eax, const float *edi, int ecx) { __m128 xmm3,xmm4,xmm0,xmm1,xmm5,xmm6; xorps (xmm3, xmm3); xorps (xmm4, xmm4); ecx-=8;// sub $8, %%ecx if (ecx<0) goto //jb mul8_skip; mul8_loop: movups (xmm0,eax); movups (xmm1,edi); movups (xmm5,16/sizeof(float)+eax); movups (xmm6,16/sizeof(float)+edi); eax+=32/sizeof(float); edi+=32/sizeof(float); mulps (xmm1,xmm0); mulps (xmm6,xmm5); addps (xmm3,xmm1); addps (xmm4,xmm6); ecx-=8; if (ecx>=0) //jae goto mul8_loop; mul8_skip: addps (xmm3,xmm4); ecx+=4; if (ecx<0) //jl goto mul4_skip; movups (xmm0,eax); movups (xmm1,edi); eax+=16/sizeof(float); edi+=16/sizeof(float); mulps (xmm1, xmm0); addps (xmm3, xmm1); ecx-=4; mul4_skip: ecx+=4; goto cond1; mul1_loop: movss (xmm0,eax); movss (xmm1,edi); eax+=4/sizeof(float); edi+=4/sizeof(float); mulss (xmm1,xmm0); addss (xmm3,xmm1); cond1: ecx-=1; if (ecx>=0) // jae goto mul1_loop; movhlps (xmm4,xmm3); addps (xmm3,xmm4); movaps (xmm4,xmm3); //FIXME: which one? xmm4=_mm_shuffle_ps(xmm4,xmm4,0x55);// shufps $0x55, xmm4, xmm4 // shufps $33, xmm4, xmm4 addss (xmm3, xmm4); float sum; movss (&sum , xmm3); return sum; }