//------------------------------is_uncommon------------------------------------
// True if block is low enough frequency or guarded by a test which
// mostly does not go here.
bool Block::is_uncommon( Block_Array &bbs ) const {
  // Initial blocks must never be moved, so are never uncommon.
  if (head()->is_Root() || head()->is_Start())  return false;

  // Check for way-low freq
  if( _freq < BLOCK_FREQUENCY(0.00001f) ) return true;

  // Look for code shape indicating uncommon_trap or slow path
  if (has_uncommon_code()) return true;

  const float epsilon = 0.05f;
  const float guard_factor = PROB_UNLIKELY_MAG(4) / (1.f - epsilon);
  uint uncommon_preds = 0;
  uint freq_preds = 0;
  uint uncommon_for_freq_preds = 0;

  for( uint i=1; i<num_preds(); i++ ) {
    Block* guard = bbs[pred(i)->_idx];
    // Check to see if this block follows its guard 1 time out of 10000
    // or less.
    //
    // See list of magnitude-4 unlikely probabilities in cfgnode.hpp which
    // we intend to be "uncommon", such as slow-path TLE allocation,
    // predicted call failure, and uncommon trap triggers.
    //
    // Use an epsilon value of 5% to allow for variability in frequency
    // predictions and floating point calculations. The net effect is
    // that guard_factor is set to 9500.
    //
    // Ignore low-frequency blocks.
    // The next check is (guard->_freq < 1.e-5 * 9500.).
    if(guard->_freq*BLOCK_FREQUENCY(guard_factor) < BLOCK_FREQUENCY(0.00001f)) {
      uncommon_preds++;
    } else {
      freq_preds++;
      if( _freq < guard->_freq * guard_factor ) {
        uncommon_for_freq_preds++;
      }
    }
  }

  if( num_preds() > 1 &&
      // The block is uncommon if all preds are uncommon or
      (uncommon_preds == (num_preds()-1) ||
       // it is uncommon for all frequent preds.
       uncommon_for_freq_preds == freq_preds) ) {
    return true;
  }
  return false;
}
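// A rough worked example of the guard test above -- illustrative only; it
// assumes PROB_UNLIKELY_MAG(4) == 1e-4f (the "magnitude-4" probabilities
// referenced in the comment) and that BLOCK_FREQUENCY() scales both sides
// of the compare the same way:
//
//   guard_factor = 1e-4f / (1.0f - 0.05f)  ~= 1.0526e-4  (about 1 in 9500)
//
//   guard->_freq == 0.05  :  0.05 * 1.0526e-4 < 1e-5, so the guard itself
//                            is too cold to trust; count an uncommon pred.
//   guard->_freq == 100.0 :  frequent pred; this block is "uncommon for"
//                            that pred only if _freq < 100 * 1.0526e-4,
//                            i.e. we reach here at most ~once per 9500
//                            executions of the guard.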
//------------------------------expand_safepoint_node----------------------
void PhaseMacroExpand::expand_safepoint_node(SafePointNode *safe) {
  // Make a fast-path/slow-path diamond around the explicit poll
  Node *tls = new (C, 1) ThreadLocalNode();
  transform_later(tls);
  ConLNode *off = _igvn.longcon(in_bytes(JavaThread::please_self_suspend_offset()));
  Node *adr = new (C, 4) AddPNode( C->top(), tls, off );
  transform_later(adr);
  Node *pss = LoadNode::make(C, NULL, safe->in(TypeFunc::Memory), adr,
                             TypeRawPtr::BOTTOM, TypeInt::INT, T_INT);
  transform_later(pss);
  Node *cmp = new (C, 3) CmpINode( pss, _igvn.intcon(0) );
  transform_later(cmp);
  Node *bol = new (C, 2) BoolNode( cmp, BoolTest::ne );
  transform_later(bol);
  Node *region = new (C, 3) RegionNode(3);
  transform_later(region);
  IfNode *iff = new (C, 2) IfNode( safe->in(TypeFunc::Control), bol,
                                   PROB_UNLIKELY_MAG(6), COUNT_UNKNOWN );
  Node *slow_path = opt_iff(region, iff);
  _igvn.hash_delete(safe);
  safe->set_req(TypeFunc::Control, slow_path);
  _igvn.subsume_node_keep_old(safe, region);
  region->init_req(1, safe);
}
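// Rough sketch of the shape built by expand_safepoint_node() above -- a
// sketch only; it assumes opt_iff() wires the likely projection of iff
// into the region and returns the unlikely projection as slow_path:
//
//             ctl
//              |
//   LoadI [TLS + please_self_suspend]
//              |
//        CmpI vs 0 -> Bool ne -> If (PROB_UNLIKELY_MAG(6))
//            /     \
//     fast: eq      slow: ne (suspend flag set)
//         |             |
//         |       SafePointNode   <- the original poll, now slow-path only
//          \           /
//            Region               <- takes over the safepoint's old
//                                    control users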
//------------------------------implicit_null_check----------------------------
// Detect implicit-null-check opportunities.  Basically, find NULL checks
// with suitable memory ops nearby.  Use the memory op to do the NULL check.
// I can generate a memory op if there is not one nearby.
// The proj is the control projection for the not-null case.
// The val is the pointer being checked for nullness or
// decodeHeapOop_not_null node if it did not fold into address.
void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allowed_reasons) {
  // Assume that if a null check is needed for a 0 offset then it is always needed.
  // Intel Solaris doesn't support any null checks yet and no
  // mechanism exists (yet) to set the switches at an os_cpu level.
  if( !ImplicitNullChecks || MacroAssembler::needs_explicit_null_check(0)) return;

  // Make sure the ptr-is-null path appears to be uncommon!
  float f = block->end()->as_MachIf()->_prob;
  if( proj->Opcode() == Op_IfTrue ) f = 1.0f - f;
  if( f > PROB_UNLIKELY_MAG(4) ) return;

  uint bidx = 0;                 // Capture index of value into memop
  bool was_store;                // Memory op is a store op

  // Get the successor block for the case where the tested ptr is non-null
  Block* not_null_block;         // this one goes with the proj
  Block* null_block;
  if (block->get_node(block->number_of_nodes()-1) == proj) {
    null_block     = block->_succs[0];
    not_null_block = block->_succs[1];
  } else {
    assert(block->get_node(block->number_of_nodes()-2) == proj, "proj is one or the other");
    not_null_block = block->_succs[0];
    null_block     = block->_succs[1];
  }
  while (null_block->is_Empty() == Block::empty_with_goto) {
    null_block = null_block->_succs[0];
  }

  // Search the exception block for an uncommon trap.
  // (See Parse::do_if and Parse::do_ifnull for the reason
  // we need an uncommon trap.  Briefly, we need a way to
  // detect failure of this optimization, as in 6366351.)
  {
    bool found_trap = false;
    for (uint i1 = 0; i1 < null_block->number_of_nodes(); i1++) {
      Node* nn = null_block->get_node(i1);
      if (nn->is_MachCall() &&
          nn->as_MachCall()->entry_point() == SharedRuntime::uncommon_trap_blob()->entry_point()) {
        const Type* trtype = nn->in(TypeFunc::Parms)->bottom_type();
        if (trtype->isa_int() && trtype->is_int()->is_con()) {
          jint tr_con = trtype->is_int()->get_con();
          Deoptimization::DeoptReason reason = Deoptimization::trap_request_reason(tr_con);
          Deoptimization::DeoptAction action = Deoptimization::trap_request_action(tr_con);
          assert((int)reason < (int)BitsPerInt, "recode bit map");

          if (is_set_nth_bit(allowed_reasons, (int) reason)
              && action != Deoptimization::Action_none) {
            // This uncommon trap is sure to recompile, eventually.
            // When that happens, C->too_many_traps will prevent
            // this transformation from happening again.
            found_trap = true;
          }
        }
        break;
      }
    }
    if (!found_trap) {
      // We did not find an uncommon trap.
      return;
    }
  }

  // Check for decodeHeapOop_not_null node which did not fold into address
  bool is_decoden = ((intptr_t)val) & 1;
  val = (Node*)(((intptr_t)val) & ~1);

  assert(!is_decoden || (val->in(0) == NULL) && val->is_Mach() &&
         (val->as_Mach()->ideal_Opcode() == Op_DecodeN), "sanity");

  // Search the successor block for a load or store whose base value is also
  // the tested value.  There may be several.
  Node_List *out = new Node_List(Thread::current()->resource_area());
  MachNode *best = NULL;         // Best found so far
  for (DUIterator i = val->outs(); val->has_out(i); i++) {
    Node *m = val->out(i);
    if( !m->is_Mach() ) continue;
    MachNode *mach = m->as_Mach();
    was_store = false;
    int iop = mach->ideal_Opcode();
    switch( iop ) {
    case Op_LoadB:
    case Op_LoadUB:
    case Op_LoadUS:
    case Op_LoadD:
    case Op_LoadF:
    case Op_LoadI:
    case Op_LoadL:
    case Op_LoadP:
    case Op_LoadN:
    case Op_LoadS:
    case Op_LoadKlass:
    case Op_LoadNKlass:
    case Op_LoadRange:
    case Op_LoadD_unaligned:
    case Op_LoadL_unaligned:
      assert(mach->in(2) == val, "should be address");
      break;
    case Op_StoreB:
    case Op_StoreC:
    case Op_StoreCM:
    case Op_StoreD:
    case Op_StoreF:
    case Op_StoreI:
    case Op_StoreL:
    case Op_StoreP:
    case Op_StoreN:
    case Op_StoreNKlass:
      was_store = true;          // Memory op is a store op
      // Stores will have their address in slot 2 (memory in slot 1).
      // If the value being null-checked is in another slot, it means we
      // are storing the checked value, which does NOT check the value!
      if( mach->in(2) != val ) continue;
      break;                     // Found a memory op?
    case Op_StrComp:
    case Op_StrEquals:
    case Op_StrIndexOf:
    case Op_AryEq:
    case Op_EncodeISOArray:
      // Not a legit memory op for implicit null check regardless of
      // embedded loads
      continue;
    default:                     // Also check for embedded loads
      if( !mach->needs_anti_dependence_check() )
        continue;                // Not a memory op; skip it
      if( must_clone[iop] ) {
        // Do not move nodes which produce flags because
        // RA will try to clone it to place near branch and
        // it will cause recompilation, see clone_node().
        continue;
      }
      {
        // Check that value is used in memory address in
        // instructions with embedded load (CmpP val1,(val2+off)).
        Node* base;
        Node* index;
        const MachOper* oper = mach->memory_inputs(base, index);
        if (oper == NULL || oper == (MachOper*)-1) {
          continue;              // Not a memory op; skip it
        }
        if (val == base ||
            val == index && val->bottom_type()->isa_narrowoop()) {
          break;                 // Found it
        } else {
          continue;              // Skip it
        }
      }
      break;
    }

    // Check if the offset is not too high for implicit exception
    {
      intptr_t offset = 0;
      const TypePtr *adr_type = NULL;  // Do not need this return value here
      const Node* base = mach->get_base_and_disp(offset, adr_type);
      if (base == NULL || base == NodeSentinel) {
        // Narrow oop address doesn't have base, only index
        if( val->bottom_type()->isa_narrowoop() &&
            MacroAssembler::needs_explicit_null_check(offset) )
          continue;              // Give up if offset is beyond page size
        // cannot reason about it; is probably not implicit null exception
      } else {
        const TypePtr* tptr;
        if (UseCompressedOops && (Universe::narrow_oop_shift() == 0 ||
                                  Universe::narrow_klass_shift() == 0)) {
          // 32-bit narrow oop can be the base of address expressions
          tptr = base->get_ptr_type();
        } else {
          // only regular oops are expected here
          tptr = base->bottom_type()->is_ptr();
        }
        // Give up if offset is not a compile-time constant
        if( offset == Type::OffsetBot || tptr->_offset == Type::OffsetBot )
          continue;
        offset += tptr->_offset; // correct if base itself carries an offset
        if( MacroAssembler::needs_explicit_null_check(offset) )
          continue;              // Give up if reference is beyond 4K page size
      }
    }

    // Check ctrl input to see if the null-check dominates the memory op
    Block *cb = get_block_for_node(mach);
    cb = cb->_idom;              // Always hoist at least 1 block
    if( !was_store ) {           // Stores can be hoisted only one block
      while( cb->_dom_depth > (block->_dom_depth + 1))
        cb = cb->_idom;          // Hoist loads as far as we want
      // The non-null-block should dominate the memory op, too.  Live
      // range spilling will insert a spill in the non-null-block if it
      // needs to spill the memory op for an implicit null check.
      if (cb->_dom_depth == (block->_dom_depth + 1)) {
        if (cb != not_null_block) continue;
        cb = cb->_idom;
      }
    }
    if( cb != block ) continue;

    // Found a memory user; see if it can be hoisted to check-block
    uint vidx = 0;               // Capture index of value into memop
    uint j;
    for( j = mach->req()-1; j > 0; j-- ) {
      if( mach->in(j) == val ) {
        vidx = j;
        // Ignore DecodeN val which could be hoisted to where needed.
        if( is_decoden ) continue;
      }
      // Block of memory-op input
      Block *inb = get_block_for_node(mach->in(j));
      Block *b = block;          // Start from null check
      while( b != inb && b->_dom_depth > inb->_dom_depth )
        b = b->_idom;            // search upwards for input
      // See if input dominates null check
      if( b != inb )
        break;
    }
    if( j > 0 )
      continue;

    Block *mb = get_block_for_node(mach);
    // Hoisting stores requires more checks for the anti-dependence case.
    // Give up hoisting if we have to move the store past any load.
    if( was_store ) {
      Block *b = mb;             // Start searching here for a local load
                                 // mach use (faulting) trying to hoist
                                 // n might be blocker to hoisting
      while( b != block ) {
        uint k;
        for( k = 1; k < b->number_of_nodes(); k++ ) {
          Node *n = b->get_node(k);
          if( n->needs_anti_dependence_check() &&
              n->in(LoadNode::Memory) == mach->in(StoreNode::Memory) )
            break;               // Found anti-dependent load
        }
        if( k < b->number_of_nodes() )
          break;                 // Found anti-dependent load
        // Make sure control does not do a merge (would have to check allpaths)
        if( b->num_preds() != 2 ) break;
        b = get_block_for_node(b->pred(1)); // Move up to predecessor block
      }
      if( b != block ) continue;
    }

    // Make sure this memory op is not already being used for a NullCheck
    Node *e = mb->end();
    if( e->is_MachNullCheck() && e->in(1) == mach )
      continue;                  // Already being used as a NULL check

    // Found a candidate!  Pick one with least dom depth - the highest
    // in the dom tree should be closest to the null check.
    if (best == NULL || get_block_for_node(mach)->_dom_depth < get_block_for_node(best)->_dom_depth) {
      best = mach;
      bidx = vidx;
    }
  }
  // No candidate!
  if (best == NULL) {
    return;
  }

  // ---- Found an implicit null check
  extern int implicit_null_checks;
  implicit_null_checks++;

  if( is_decoden ) {
    // Check if we need to hoist decodeHeapOop_not_null first.
    Block *valb = get_block_for_node(val);
    if( block != valb && block->_dom_depth < valb->_dom_depth ) {
      // Hoist it up to the end of the test block.
      valb->find_remove(val);
      block->add_inst(val);
      map_node_to_block(val, block);
      // DecodeN on x86 may kill flags.  Check for flag-killing projections
      // that also need to be hoisted.
      for (DUIterator_Fast jmax, j = val->fast_outs(jmax); j < jmax; j++) {
        Node* n = val->fast_out(j);
        if( n->is_MachProj() ) {
          get_block_for_node(n)->find_remove(n);
          block->add_inst(n);
          map_node_to_block(n, block);
        }
      }
    }
  }

  // Hoist the memory candidate up to the end of the test block.
  Block *old_block = get_block_for_node(best);
  old_block->find_remove(best);
  block->add_inst(best);
  map_node_to_block(best, block);

  // Move the control dependence
  if (best->in(0) && best->in(0) == old_block->head())
    best->set_req(0, block->head());

  // Check for flag-killing projections that also need to be hoisted
  // Should be DU safe because no edge updates.
  for (DUIterator_Fast jmax, j = best->fast_outs(jmax); j < jmax; j++) {
    Node* n = best->fast_out(j);
    if( n->is_MachProj() ) {
      get_block_for_node(n)->find_remove(n);
      block->add_inst(n);
      map_node_to_block(n, block);
    }
  }

  // proj==Op_True --> ne test; proj==Op_False --> eq test.
  // One of two graph shapes got matched:
  //   (IfTrue  (If (Bool NE (CmpP ptr NULL))))
  //   (IfFalse (If (Bool EQ (CmpP ptr NULL))))
  // NULL checks are always branch-if-eq.  If we see an IfTrue projection
  // then we are replacing a 'ne' test with an 'eq' NULL check test.
  // We need to flip the projections to keep the same semantics.
  if( proj->Opcode() == Op_IfTrue ) {
    // Swap order of projections in basic block to swap branch targets
    Node *tmp1 = block->get_node(block->end_idx()+1);
    Node *tmp2 = block->get_node(block->end_idx()+2);
    block->map_node(tmp2, block->end_idx()+1);
    block->map_node(tmp1, block->end_idx()+2);
    Node *tmp = new (C) Node(C->top()); // Use not NULL input
    tmp1->replace_by(tmp);
    tmp2->replace_by(tmp1);
    tmp->replace_by(tmp2);
    tmp->destruct();
  }

  // Remove the existing null check; use a new implicit null check instead.
  // Since schedule-local needs precise def-use info, we need to correct
  // it as well.
  Node *old_tst = proj->in(0);
  MachNode *nul_chk = new (C) MachNullCheckNode(old_tst->in(0), best, bidx);
  block->map_node(nul_chk, block->end_idx());
  map_node_to_block(nul_chk, block);

  // Redirect users of old_tst to nul_chk
  for (DUIterator_Last i2min, i2 = old_tst->last_outs(i2min); i2 >= i2min; --i2)
    old_tst->last_out(i2)->set_req(0, nul_chk);
  // Clean up any dead code
  for (uint i3 = 0; i3 < old_tst->req(); i3++)
    old_tst->set_req(i3, NULL);

  latency_from_uses(nul_chk);
  latency_from_uses(best);
}
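// Net effect, roughly -- a sketch of the two shapes named in the comments
// above, not additional code; 'best' and bidx are the candidate memory op
// and its value-input index found in the search loop:
//
//   Before:  (IfTrue/IfFalse (If (Bool NE/EQ (CmpP val NULL))))
//            explicit compare-and-branch ending the block
//
//   After:   (MachNullCheck old_tst->in(0) best bidx)
//            'best' (a load/store off val) ends the block and faults if
//            val is NULL; the two projections keep their not-null / null
//            successors, and the null successor still reaches the uncommon
//            trap located earlier in this function.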
//--- expand_allocate --------------------------------------------------------
void PhaseMacroExpand::expand_allocate(AllocateNode *A) {
  // See if we are forced to go slow-path.
  if( A->_entry_point == (address)SharedRuntime::_new )
    return;                     // Always go slow-path - required for finalizers, etc.

  Node *A_ctl = A->proj_out(TypeFunc::Control);
  Node *A_mem = A->proj_out(TypeFunc::Memory);
  Node *A_oop = A->proj_out(TypeFunc::Parms+0);

  // Inject a fast-path / slow-path diamond.  Fast-path is still milli-code,
  // which returns an oop or null - and does not block, nor GC, nor kill
  // registers.  If we have an allocation failure the fast-path leaves the
  // regs in-place for a slow-path call which DOES block, GC, etc.
  Node *ctl = A->in(TypeFunc::Control);
  Node *kid = A->in(AllocateNode::KID);
  Node *siz = A->in(AllocateNode::AllocSize);
  Node *xtr = A->in(AllocateNode::ExtraSlow);
  Node *len = A->is_AllocateArray() ? A->in(AllocateNode::ALength) : C->top();

  if ( !A_oop && ( !A->is_AllocateArray() ||
                   _igvn.type(A->in(AllocateNode::ALength))->higher_equal(TypeInt::POS) ) ) {
    tty->print_cr("Dead allocation should be removed earlier");
    Unimplemented();
  }

  // Convert the array element count to a Long value,
  // and fold in the EKID value for oop-arrays.
  const TypeOopPtr *toop = A->_tf->range()->field_at(TypeFunc::Parms+0)->is_ptr()->cast_to_ptr_type(TypePtr::BotPTR)->is_oopptr();
  if( len != C->top() ) {
    assert0( A->is_AllocateArray() );
    Node *lenl = transform_later(new (C, 2) ConvI2LNode(len));
    Node *ekid = A->in(AllocateNode::EKID);
    if( ekid->bottom_type() != TypeInt::ZERO ) {
      // Have an EKID?  Smash the array length and EKID together.
      Node *ekidl   = transform_later(new (C, 2) CastP2XNode(0, ekid));
      Node *ekidshf = transform_later(new (C, 3) LShiftLNode(ekidl, _igvn.intcon(32)));
      Node *combo   = transform_later(new (C, 3) OrLNode(lenl, ekidshf));
      len = combo;
    } else {
      len = lenl;
    }
    // Crunch arguments for matching.  The old ExtraSlow argument is used to
    // make more gating control flow in this function, but is not an argument
    // to the runtime call.  Neither is the EKID argument: the slow-path will
    // compute its own EKID.
    A->set_req(AllocateNode::ExtraSlow, len);
    A->set_req(AllocateNode::ALength, C->top());
    A->set_req(AllocateNode::EKID, C->top());
  } else {
    A->set_req(AllocateNode::ExtraSlow, _igvn.zerocon(T_INT));
    assert0( !A->is_AllocateArray() );
  }

  // Extra slow-path test required?
  RegionNode *region2 = NULL;
  if( xtr->bottom_type() != TypeInt::ZERO ) { // Commonly, no extra test required
    // Extra slow-path tests can be required for fast-pathing reflection
    // allocation (and cloning), e.g. if the new object requires finalization
    // or further class linking/loading.
    Node *cmp = transform_later(new (C, 3) CmpINode(xtr, _igvn.zerocon(T_INT)));
    Node *bol = transform_later(new (C, 2) BoolNode(cmp, BoolTest::eq));
    Node *iff = new (C, 2) IfNode( ctl, bol, PROB_LIKELY_MAG(5), COUNT_UNKNOWN );
    region2 = new (C, 3) RegionNode(3);
    transform_later(region2);
    ctl = opt_iff(region2, iff);
  }

  FastAllocNode *fal = new (C, 4) FastAllocNode(kid, siz, toop, len);
  fal->set_req(0, ctl);
  transform_later(fal);
  Node *mem = new (C, 1) SCMemProjNode(fal);
  transform_later(mem);
  Node *cmp = transform_later(new (C, 3) CmpPNode(fal, _igvn.zerocon(T_OBJECT)));
  Node *bol = transform_later(new (C, 2) BoolNode(cmp, BoolTest::eq));
  Node *iff = new (C, 2) IfNode( ctl, bol, PROB_UNLIKELY_MAG(5), COUNT_UNKNOWN );
  RegionNode *region = new (C, 3) RegionNode(3);
  transform_later(region);
  Node *slow_path = opt_iff(region, iff);

  // Make the merge point
  PhiNode *phimem = new (C, 3) PhiNode(region, Type::MEMORY, TypePtr::BOTTOM);
  transform_later(phimem);
  phimem->init_req(2, mem);     // Plug in the fast-path
  PhiNode *phioop = NULL;
  if (A_oop) {
    phioop = new (C, 3) PhiNode(region, toop);
    transform_later(phioop);
    phioop->init_req(2, fal);   // Plug in the fast-path
  }

  _igvn.hash_delete(A_ctl);
  _igvn.hash_delete(A_mem);
  if (A_oop) _igvn.hash_delete(A_oop);
  _igvn.subsume_node_keep_old(A_ctl, region);
  _igvn.subsume_node_keep_old(A_mem, phimem);
  if (A_oop) _igvn.subsume_node_keep_old(A_oop, phioop);

  // Plug in the slow-path
  region->init_req(1, A_ctl);
  phimem->init_req(1, A_mem);
  if (A_oop) phioop->init_req(1, A_oop);
  if( xtr->bottom_type() != TypeInt::ZERO ) { // Commonly, no extra test required
    region2->init_req(1, slow_path);
    slow_path = region2;
  }
  A->set_req(TypeFunc::Control, slow_path);
  // The slow-path call now directly calls into the runtime.
  A->_entry_point = (address)SharedRuntime::_new;
}
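// Rough sketch of the diamond built by expand_allocate() above -- a sketch
// only; as with the safepoint expansion it assumes opt_iff() wires the
// likely projection of iff into the region and returns the other
// projection as slow_path:
//
//            ctl (after the optional ExtraSlow test)
//             |
//     FastAllocNode(kid, siz, toop, len)     <- milli-code; no block/GC
//             |
//      CmpP vs NULL -> Bool eq -> If (PROB_UNLIKELY_MAG(5))
//           /       \
//     fast: ne       slow: eq (fast allocation failed)
//        |               |
//        |         AllocateNode A            <- now calls SharedRuntime::_new
//         \             /
//   Region / Phi(memory) / Phi(oop)          <- take over A's old projections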