// Find the IndexSet such that modDown to that set of primes makes the // additive term due to rounding into the dominant noise term void Ctxt::findBaseSet(IndexSet& s) const { if (getNoiseVar()<=0.0) { // an empty ciphertext s = context.ctxtPrimes; return; } assert(verifyPrimeSet()); bool halfSize = context.containsSmallPrime(); double curNoise = log(getNoiseVar())/2; double firstNoise = context.logOfPrime(0); double noiseThreshold = log(modSwitchAddedNoiseVar())*0.55; // FIXME: The above should have been 0.5. Making it a bit more means // that we will mod-switch a little less frequently, whether this is // a good thing needs to be tested. // remove special primes, if they are included in this->primeSet s = getPrimeSet(); if (!s.disjointFrom(context.specialPrimes)) { // scale down noise curNoise -= context.logOfProduct(context.specialPrimes); s.remove(context.specialPrimes); } /* We compare below to noiseThreshold+1 rather than to noiseThreshold * to make sure that if you mod-switch down to c.findBaseSet() and * then immediately call c.findBaseSet() again, it will not tell you * to mod-switch further down. Note that mod-switching adds close to * noiseThreshold to the scaled noise, so if the scaled noise was * equal to noiseThreshold then after mod-switchign you would have * roughly twice as much noise. Since we're mesuring the log, it means * that you may have as much as noiseThreshold+log(2), which we round * up to noiseThreshold+1 in the test below. 
*/ if (curNoise<=noiseThreshold+1) return; // no need to mod down // if the first prime in half size, begin by removing it if (halfSize && s.contains(0)) { curNoise -= firstNoise; s.remove(0); } // while noise is larger than threshold, scale down by the next prime while (curNoise>noiseThreshold && !empty(s)) { curNoise -= context.logOfPrime(s.last()); s.remove(s.last()); } // Add 1st prime if s is empty or if this does not increase noise too much if (empty(s) || (!s.contains(0) && curNoise+firstNoise<=noiseThreshold)) { s.insert(0); curNoise += firstNoise; } if (curNoise>noiseThreshold && log_of_ratio()>-0.5) cerr << "Ctxt::findBaseSet warning: already at lowest level\n"; }
// Find the IndexSet such that modDown to that set of primes makes the // additive term due to rounding into the dominant noise term void Ctxt::findBaseSet(IndexSet& s) const { if (getNoiseVar()<=0.0) { // an empty ciphertext s = context.ctxtPrimes; return; } assert(verifyPrimeSet()); bool halfSize = context.containsSmallPrime(); double addedNoise = log(modSwitchAddedNoiseVar())/2; double curNoise = log(getNoiseVar())/2; double firstNoise = context.logOfPrime(0); // remove special primes, if they are included in this->primeSet s = getPrimeSet(); if (!s.disjointFrom(context.specialPrimes)) { // scale down noise curNoise -= context.logOfProduct(context.specialPrimes); s.remove(context.specialPrimes); } if (curNoise<=2*addedNoise) return; // no need to mod down // if the first prime in half size, begin by removing it if (halfSize && s.contains(0)) { curNoise -= firstNoise; s.remove(0); } // while noise is larger than added term, scale down by the next prime while (curNoise>addedNoise && card(s)>1) { curNoise -= context.logOfPrime(s.last()); s.remove(s.last()); } if (halfSize) { // If noise is still too big, drop last big prime and insert half-size prime if (curNoise>addedNoise) { curNoise = firstNoise; s = IndexSet(0); } // Otherwise check if you can add back the half-size prime else if (curNoise+firstNoise <= addedNoise) { curNoise += firstNoise; s.insert(0); } } if (curNoise>addedNoise && log_of_ratio()>-0.5) cerr << "Ctxt::findBaseSet warning: already at lowest level\n"; }
// Compute the per-block live sets for live ranges numbered below maxlrg.
//
// The _live array is allocated from _arena and holds the result. The _defs
// and _deltas arrays are resource-allocated scratch storage set up under the
// ResourceMark declared below. Blocks are visited from the last entry of
// _cfg._blocks to the first; each block's local live-in set is pushed to its
// predecessors via add_liveout(), and any block queued on _worklist is then
// drained before the next block is visited.
void PhaseLive::compute(uint maxlrg) {
  _maxlrg   = maxlrg;
  _worklist = new (_arena) Block_List();

  // Init the sparse live arrays.  This data is live on exit from here!
  // The _live info is the live-out info.
  _live = (IndexSet*)_arena->Amalloc(sizeof(IndexSet)*_cfg._num_blocks);
  uint i;
  for( i=0; i<_cfg._num_blocks; i++ ) {
    _live[i].initialize(_maxlrg);
  }

  // Init the sparse arrays for delta-sets.
  ResourceMark rm;              // Nuke temp storage on exit

  // Does the memory used by _defs and _deltas get reclaimed?  Does it matter?  TT

  // Array of values defined locally in blocks
  _defs = NEW_RESOURCE_ARRAY(IndexSet,_cfg._num_blocks);
  for( i=0; i<_cfg._num_blocks; i++ ) {
    _defs[i].initialize(_maxlrg);
  }

  // Array of delta-set pointers, indexed by block pre_order-1.
  // All entries start out as null pointers.
  _deltas = NEW_RESOURCE_ARRAY(IndexSet*,_cfg._num_blocks);
  memset( _deltas, 0, sizeof(IndexSet*)* _cfg._num_blocks);

  _free_IndexSet = NULL;

  // Blocks having done pass-1
  VectorSet first_pass(Thread::current()->resource_area());

  // Outer loop: must compute local live-in sets and push into predecessors.
  uint iters = _cfg._num_blocks;        // stat counters
  for( uint j=_cfg._num_blocks; j>0; j-- ) {
    Block *b = _cfg._blocks[j-1];

    // Compute the local live-in set.  Start with any new live-out bits.
    IndexSet *use = getset( b );
    IndexSet *def = &_defs[b->_pre_order-1];
    // Note: this 'i' shadows the function-level 'i' for the rest of this
    // iteration.  After the loop below it is the number of leading nodes
    // (block start instruction plus Phis) that the loop did not process.
    uint i;
    for( i=b->_nodes.size(); i>1; i-- ) {
      Node *n = b->_nodes[i-1];
      if( n->is_Phi() ) break;
      // BoxNodes keep their input alive as long as their uses.  If we
      // see a BoxNode then make its input live to the Root block.
      // Because we are solving LIVEness, the input now becomes live
      // over the whole procedure, interfering with everything else
      // and getting a private unshared stack slot.  YeeeHaw!
      MachNode *mach = n->is_Mach();
      if( mach && mach->ideal_Opcode() == Op_Box )
        getset(_cfg._broot)->insert( _names[n->in(1)->_idx] );

      // The value defined by n is a local def and is not live above n
      uint r = _names[n->_idx];
      def->insert( r );
      use->remove( r );
      // Only inputs mapped to a different block are added to the use set
      uint cnt = n->req();
      for( uint k=1; k<cnt; k++ ) {
        Node *nk = n->in(k);
        uint nkidx = nk->_idx;
        if( _cfg._bbs[nkidx] != b )
          use->insert( _names[nkidx] );
      }
    }
    // Remove anything defined by Phis and the block start instruction
    for( uint k=i; k>0; k-- ) {
      uint r = _names[b->_nodes[k-1]->_idx];
      def->insert( r );
      use->remove( r );
    }

    // Push these live-in things to predecessors
    for( uint l=1; l<b->num_preds(); l++ ) {
      Block *p = _cfg._bbs[b->pred(l)->_idx];
      add_liveout( p, use, first_pass );

      // PhiNode uses go in the live-out set of prior blocks.
      for( uint k=i; k>0; k-- )
        add_liveout( p, _names[b->_nodes[k-1]->in(l)->_idx], first_pass );
    }
    freeset( b );
    first_pass.set(b->_pre_order);

    // Inner loop: blocks that picked up new live-out values to be propagated
    while( _worklist->size() ) {
      // !!!!!
      // #ifdef ASSERT
      iters++;
      // #endif
      Block *b = _worklist->pop();
      IndexSet *delta = getset(b);
      assert( delta->count(), "missing delta set" );

      // Add new-live-in to predecessors live-out sets
      for( uint l=1; l<b->num_preds(); l++ )
        add_liveout( _cfg._bbs[b->pred(l)->_idx], delta, first_pass );

      freeset(b);
    } // End of while-worklist-not-empty

  } // End of for-all-blocks-outer-loop

  // We explicitly clear all of the IndexSets which we are about to release.
  // This allows us to recycle their internal memory into IndexSet's free list.

  for( i=0; i<_cfg._num_blocks; i++ ) {
    _defs[i].clear();
    if (_deltas[i]) {
      // Is this always true?
      _deltas[i]->clear();
    }
  }
  // Also clear every set chained from _free_IndexSet
  IndexSet *free = _free_IndexSet;
  while (free != NULL) {
    IndexSet *temp = free;
    free = free->next();
    temp->clear();
  }

}
// Inserts a MoveZeroToDouble in front of instructions that partially update
// an FP register, whenever the register they define (without using) was last
// set fewer than minimumSafeDistance instructions earlier. Does nothing
// unless isX86() holds.
void fixPartialRegisterStalls(Code& code)
{
    if (!isX86())
        return;

    PhaseScope phaseScope(code, "fixPartialRegisterStalls");

    // Collect the blocks that contain at least one partial register update.
    auto hasPartialUpdate = [] (BasicBlock* block) -> bool {
        for (const Inst& inst : *block) {
            if (hasPartialXmmRegUpdate(inst))
                return true;
        }
        return false;
    };

    Vector<BasicBlock*> candidates;
    for (BasicBlock* block : code) {
        if (hasPartialUpdate(block))
            candidates.append(block);
    }

    // Partial updates are rare; bail out early when no block has any.
    if (candidates.isEmpty())
        return;

    // For each block, the distance to the last instruction setting each
    // register, as seen on block *entry*.
    IndexMap<BasicBlock, FPDefDistance> distanceAtHead(code.size());

    // Blocks whose distance at head changed and still has to be propagated.
    IndexSet<BasicBlock> pending;

    // Phase 1: compute the local distances of each block and push them
    // to its successors.
    for (BasicBlock* block : code) {
        FPDefDistance blockLocal;
        unsigned remaining = block->size();
        for (Inst& inst : *block)
            updateDistances(inst, blockLocal, remaining);

        for (BasicBlock* next : block->successorBlocks()) {
            if (distanceAtHead[next].updateFromPrecessor(blockLocal))
                pending.add(next);
        }
    }

    // Phase 2: propagate the minimums across blocks until nothing changes.
    for (bool changed = true; changed;) {
        changed = false;
        for (BasicBlock* block : code) {
            // Skip blocks that are not pending. Also skip blocks that are
            // big enough: propagating them would not add any information.
            if (!pending.remove(block) || block->size() >= minimumSafeDistance)
                continue;

            unsigned length = block->size();
            FPDefDistance& atHead = distanceAtHead[block];
            for (BasicBlock* next : block->successorBlocks()) {
                if (distanceAtHead[next].updateFromPrecessor(atHead, length)) {
                    pending.add(next);
                    changed = true;
                }
            }
        }
    }

    // Phase 3: patch each candidate block as needed.
    InsertionSet insertionSet(code);
    for (BasicBlock* block : candidates) {
        unsigned remaining = block->size();
        FPDefDistance& running = distanceAtHead[block];

        for (unsigned index = 0; index < block->size(); ++index) {
            Inst& inst = block->at(index);

            if (hasPartialXmmRegUpdate(inst)) {
                RegisterSet defined;
                RegisterSet used;
                inst.forEachTmp([&] (Tmp& tmp, Arg::Role role, Arg::Type) {
                    if (!tmp.isFPR())
                        return;
                    if (Arg::isDef(role))
                        defined.set(tmp.fpr());
                    if (Arg::isAnyUse(role))
                        used.set(tmp.fpr());
                });

                // Only registers that are defined but not used matter: for a
                // used register we have to wait for its value anyway.
                defined.exclude(used);

                defined.forEach([&] (Reg reg) {
                    unsigned slot = MacroAssembler::fpRegisterIndex(reg.fpr());
                    if (running.distance[slot] < minimumSafeDistance)
                        insertionSet.insert(index, MoveZeroToDouble, inst.origin, Tmp(reg));
                });
            }

            updateDistances(inst, running, remaining);
        }
        insertionSet.execute(block);
    }
}
void adjustLevelForMult(Ctxt& c1, const char name1[], const ZZX& p1, Ctxt& c2, const char name2[], const ZZX& p2, const FHESecKey& sk) { const FHEcontext& context = c1.getContext(); // The highest possible level for this multiplication is the // intersection of the two primeSets, without the special primes. IndexSet primes = c1.getPrimeSet() & c2.getPrimeSet(); primes.remove(context.specialPrimes); assert (!empty(primes)); // double phim = (double) context.zMstar.phiM(); // double factor = c_m*sqrt(log(phim))*4; xdouble n1,n2,d1,d2; xdouble dvar1 = c1.modSwitchAddedNoiseVar(); xdouble dvar2 = c2.modSwitchAddedNoiseVar(); // xdouble dmag1 = c1.modSwitchAddedNoiseMag(c_m); // xdouble dmag2 = c2.modSwitchAddedNoiseMag(c_m); // cout << " ** log(dvar1)=" << log(dvar1) // << ", log(dvar2)=" << log(dvar2) <<endl; double logF1, logF2; xdouble n1var, n2var, modSize; // n1mag, n2mag, // init to large number xdouble noiseVarRatio=xexp(2*(context.logOfProduct(context.ctxtPrimes) + context.logOfProduct(context.specialPrimes))); // xdouble noiseMagRatio=noiseVarRatio; // Find the level that minimizes the noise-to-modulus ratio bool oneLevelMore = false; for (IndexSet levelDown = primes; !empty(levelDown); levelDown.remove(levelDown.last())) { // compute noise variane/magnitude after mod-switchign to this level logF1 = context.logOfProduct(c1.getPrimeSet() / levelDown); n1var = c1.getNoiseVar()/xexp(2*logF1); logF2 = context.logOfProduct(c2.getPrimeSet() / levelDown); n2var = c2.getNoiseVar()/xexp(2*logF2); // compute modulus/noise ratio at this level modSize = xexp(context.logOfProduct(levelDown)); xdouble nextNoiseVarRatio = sqrt((n1var+dvar1)*(n2var+dvar2))/modSize; if (nextNoiseVarRatio < 2*noiseVarRatio || oneLevelMore) { noiseVarRatio = nextNoiseVarRatio; primes = levelDown; // record the currently best prime set n1 = n1var; d1=dvar1; n2 = n2var; d2=dvar2; } oneLevelMore = (n1var > dvar1 || n2var > dvar2); } if (primes < c1.getPrimeSet()) { cout << " ** " << 
c1.getPrimeSet()<<"=>"<<primes << endl; cout << " n1var="<<n1<<", d1var="<<d1<<endl;; c1.modDownToSet(primes); cout << name1 << ".mDown:"; checkCiphertext(c1, p1, sk); } if (primes < c2.getPrimeSet()) { cout << " ** " << c2.getPrimeSet()<<"=>"<<primes << endl; cout << " n2var="<<n2<<", d2var="<<d2<<endl;; c2.modDownToSet(primes); cout << name2 << ".mDown:"; checkCiphertext(c2, p2, sk); } }
//------------------------------insert_copies----------------------------------
// Walk every block and every node, replacing virtual copies with real ones:
//   1. bypass or remove copies whose source and destination were given the
//      same name by _phc.Find();
//   2. for Phis, insert a copy (or a clone of a rematerializable constant) in
//      the predecessor block for each input whose name differs from the Phi's;
//   3. for 2-address instructions, insert a copy just before the instruction
//      when the 2-address input has a different name;
//   4. in low-frequency or uncommon blocks, split high-frequency live ranges
//      at debug uses when they are not live into any successor.
// The 'matcher' parameter is not referenced in this body; C->matcher() is
// used instead.
void PhaseAggressiveCoalesce::insert_copies( Matcher &matcher ) {
  // We do LRGs compressing and fix a liveout data only here since the other
  // place in Split() is guarded by the assert which we never hit.
  _phc.compress_uf_map_for_nodes();
  // Fix block's liveout data for compressed live ranges.
  for(uint lrg = 1; lrg < _phc._maxlrg; lrg++ ) {
    uint compressed_lrg = _phc.Find(lrg);
    if( lrg != compressed_lrg ) {
      for( uint bidx = 0; bidx < _phc._cfg._num_blocks; bidx++ ) {
        IndexSet *liveout = _phc._live->live(_phc._cfg._blocks[bidx]);
        if( liveout->member(lrg) ) {
          liveout->remove(lrg);
          liveout->insert(compressed_lrg);
        }
      }
    }
  }

  // All new nodes added are actual copies to replace virtual copies.
  // Nodes with index less than '_unique' are original, non-virtual Nodes.
  _unique = C->unique();

  for( uint i=0; i<_phc._cfg._num_blocks; i++ ) {
    Block *b = _phc._cfg._blocks[i];
    uint cnt = b->num_preds();  // Number of inputs to the Phi

    // Note: 'l' is adjusted inside the loop body whenever a node is removed
    // from, or inserted into, b->_nodes.
    for( uint l = 1; l<b->_nodes.size(); l++ ) {
      Node *n = b->_nodes[l];

      // Do not use removed-copies, use copied value instead
      uint ncnt = n->req();
      for( uint k = 1; k<ncnt; k++ ) {
        Node *copy = n->in(k);
        uint cidx = copy->is_Copy();
        if( cidx ) {
          Node *def = copy->in(cidx);
          if( _phc.Find(copy) == _phc.Find(def) )
            n->set_req(k,def);
        }
      }

      // Remove any explicit copies that get coalesced.
      uint cidx = n->is_Copy();
      if( cidx ) {
        Node *def = n->in(cidx);
        if( _phc.Find(n) == _phc.Find(def) ) {
          n->replace_by(def);
          n->set_req(cidx,NULL);
          b->_nodes.remove(l);
          l--;                  // Revisit this slot; it now holds the next node
          continue;
        }
      }

      if( n->is_Phi() ) {
        // Get the chosen name for the Phi
        uint phi_name = _phc.Find( n );
        // Ignore the pre-allocated specials
        if( !phi_name ) continue;
        // Check for mismatch inputs to Phi
        for( uint j = 1; j<cnt; j++ ) {
          Node *m = n->in(j);
          uint src_name = _phc.Find(m);
          if( src_name != phi_name ) {
            Block *pred = _phc._cfg._bbs[b->pred(j)->_idx];
            Node *copy;
            assert(!m->is_Con() || m->is_Mach(), "all Con must be Mach");
            // Rematerialize constants instead of copying them
            if( m->is_Mach() && m->as_Mach()->is_Con() &&
                m->as_Mach()->rematerialize() ) {
              copy = m->clone();
              // Insert the copy in the predecessor basic block
              pred->add_inst(copy);
              // Copy any flags as well
              _phc.clone_projs( pred, pred->end_idx(), m, copy, _phc._maxlrg );
            } else {
              const RegMask *rm = C->matcher()->idealreg2spillmask[m->ideal_reg()];
              copy = new (C) MachSpillCopyNode(m,*rm,*rm);
              // Find a good place to insert.  Kinda tricky, use a subroutine
              insert_copy_with_overlap(pred,copy,phi_name,src_name);
            }
            // Insert the copy in the use-def chain
            n->set_req( j, copy );
            _phc._cfg._bbs.map( copy->_idx, pred );
            // Extend ("register allocate") the names array for the copy.
            _phc._names.extend( copy->_idx, phi_name );
          } // End of if Phi names do not match
        } // End of for all inputs to Phi
      } else { // End of if Phi

        // Now check for 2-address instructions
        uint idx;
        if( n->is_Mach() && (idx=n->as_Mach()->two_adr()) ) {
          // Get the chosen name for the Node
          uint name = _phc.Find( n );
          assert( name, "no 2-address specials" );
          // Check for name mis-match on the 2-address input
          Node *m = n->in(idx);
          if( _phc.Find(m) != name ) {
            Node *copy;
            assert(!m->is_Con() || m->is_Mach(), "all Con must be Mach");
            // At this point it is unsafe to extend live ranges (6550579).
            // Rematerialize only constants as we do for Phi above.
            if( m->is_Mach() && m->as_Mach()->is_Con() &&
                m->as_Mach()->rematerialize() ) {
              copy = m->clone();
              // Insert the copy in the basic block, just before us
              b->_nodes.insert( l++, copy );
              // Step over one more slot when clone_projs() reports success
              if( _phc.clone_projs( b, l, m, copy, _phc._maxlrg ) )
                l++;
            } else {
              const RegMask *rm = C->matcher()->idealreg2spillmask[m->ideal_reg()];
              copy = new (C) MachSpillCopyNode( m, *rm, *rm );
              // Insert the copy in the basic block, just before us
              b->_nodes.insert( l++, copy );
            }
            // Insert the copy in the use-def chain
            n->set_req(idx, copy );
            // Extend ("register allocate") the names array for the copy.
            _phc._names.extend( copy->_idx, name );
            _phc._cfg._bbs.map( copy->_idx, b );
          }

        } // End of is two-adr

        // Insert a copy at a debug use for a lrg which has high frequency
        if( b->_freq < OPTO_DEBUG_SPLIT_FREQ || b->is_uncommon(_phc._cfg._bbs) ) {
          // Walk the debug inputs to the node and check for lrg freq
          JVMState* jvms = n->jvms();
          // Without a JVMState both bounds are the same sentinel, so the
          // loop below does not execute.
          uint debug_start = jvms ? jvms->debug_start() : 999999;
          uint debug_end   = jvms ? jvms->debug_end()   : 999999;
          for(uint inpidx = debug_start; inpidx < debug_end; inpidx++) {
            // Do not split monitors; they are only needed for debug table
            // entries and need no code.
            if( jvms->is_monitor_use(inpidx) ) continue;
            Node *inp = n->in(inpidx);
            uint nidx = _phc.n2lidx(inp);
            LRG &lrg = lrgs(nidx);

            // If this lrg has a high frequency use/def
            if( lrg._maxfreq >= _phc.high_frequency_lrg() ) {
              // If the live range is also live out of this block (like it
              // would be for a fast/slow idiom), the normal spill mechanism
              // does an excellent job.  If it is not live out of this block
              // (like it would be for debug info to uncommon trap) splitting
              // the live range now allows a better allocation in the high
              // frequency blocks.
              //   Build_IFG_virtual has converted the live sets to
              // live-IN info, not live-OUT info.
              uint k;
              for( k=0; k < b->_num_succs; k++ )
                if( _phc._live->live(b->_succs[k])->member( nidx ) )
                  break;      // Live in to some successor block?
              if( k < b->_num_succs )
                continue;     // Live out; do not pre-split
              // Split the lrg at this use
              const RegMask *rm = C->matcher()->idealreg2spillmask[inp->ideal_reg()];
              Node *copy = new (C) MachSpillCopyNode( inp, *rm, *rm );
              // Insert the copy in the use-def chain
              n->set_req(inpidx, copy );
              // Insert the copy in the basic block, just before us
              b->_nodes.insert( l++, copy );
              // Extend ("register allocate") the names array for the copy.
              _phc.new_lrg( copy, _phc._maxlrg++ );
              _phc._cfg._bbs.map( copy->_idx, b );
              //tty->print_cr("Split a debug use in Aggressive Coalesce");
            }  // End of if high frequency use/def
          }  // End of for all debug inputs
        }  // End of if low frequency safepoint

      } // End of if Phi

    } // End of for all instructions
  } // End of for all blocks
}
//------------------------------build_ifg_virtual------------------------------ // Actually build the interference graph. Uses virtual registers only, no // physical register masks. This allows me to be very aggressive when // coalescing copies. Some of this aggressiveness will have to be undone // later, but I'd rather get all the copies I can now (since unremoved copies // at this point can end up in bad places). Copies I re-insert later I have // more opportunity to insert them in low-frequency locations. void PhaseChaitin::build_ifg_virtual( ) { // For all blocks (in any order) do... for( uint i=0; i<_cfg._num_blocks; i++ ) { Block *b = _cfg._blocks[i]; IndexSet *liveout = _live->live(b); // The IFG is built by a single reverse pass over each basic block. // Starting with the known live-out set, we remove things that get // defined and add things that become live (essentially executing one // pass of a standard LIVE analysis). Just before a Node defines a value // (and removes it from the live-ness set) that value is certainly live. // The defined value interferes with everything currently live. The // value is then removed from the live-ness set and it's inputs are // added to the live-ness set. for( uint j = b->end_idx() + 1; j > 1; j-- ) { Node *n = b->_nodes[j-1]; // Get value being defined uint r = n2lidx(n); // Some special values do not allocate if( r ) { // Remove from live-out set liveout->remove(r); // Copies do not define a new value and so do not interfere. // Remove the copies source from the liveout set before interfering. 
uint idx = n->is_Copy(); if( idx ) liveout->remove( n2lidx(n->in(idx)) ); // Interfere with everything live interfere_with_live( r, liveout ); } // Make all inputs live if( !n->is_Phi() ) { // Phi function uses come from prior block for( uint k = 1; k < n->req(); k++ ) liveout->insert( n2lidx(n->in(k)) ); } // 2-address instructions always have the defined value live // on entry to the instruction, even though it is being defined // by the instruction. We pretend a virtual copy sits just prior // to the instruction and kills the src-def'd register. // In other words, for 2-address instructions the defined value // interferes with all inputs. uint idx; if( n->is_Mach() && (idx = n->as_Mach()->two_adr()) ) { const MachNode *mach = n->as_Mach(); // Sometimes my 2-address ADDs are commuted in a bad way. // We generally want the USE-DEF register to refer to the // loop-varying quantity, to avoid a copy. uint op = mach->ideal_Opcode(); // Check that mach->num_opnds() == 3 to ensure instruction is // not subsuming constants, effectively excludes addI_cin_imm // Can NOT swap for instructions like addI_cin_imm since it // is adding zero to yhi + carry and the second ideal-input // points to the result of adding low-halves. // Checking req() and num_opnds() does NOT distinguish addI_cout from addI_cout_imm if( (op == Op_AddI && mach->req() == 3 && mach->num_opnds() == 3) && n->in(1)->bottom_type()->base() == Type::Int && // See if the ADD is involved in a tight data loop the wrong way n->in(2)->is_Phi() && n->in(2)->in(2) == n ) { Node *tmp = n->in(1); n->set_req( 1, n->in(2) ); n->set_req( 2, tmp ); } // Defined value interferes with all inputs uint lidx = n2lidx(n->in(idx)); for( uint k = 1; k < n->req(); k++ ) { uint kidx = n2lidx(n->in(k)); if( kidx != lidx ) _ifg->add_edge( r, kidx ); } } } // End of forall instructions in block } // End of forall blocks }
// set minus IndexSet operator/(const IndexSet& s, const IndexSet& t) { IndexSet r = s; r.remove(t); return r; }
// exclusive-or IndexSet operator^(const IndexSet& s, const IndexSet& t) { IndexSet r = s | t; r.remove(s & t); return r; }