/// Runs dead code elimination over every basic block of the kernel.
/// Seeds a worklist with all blocks, then drains it; eliminating dead
/// instructions in one block may re-queue other blocks via the shared set.
void DeadCodeEliminationPass::runOnKernel(ir::IRKernel& k) {
    report("Running dead code elimination on kernel " << k.name);
    reportE(REPORT_PTX, k);

    Analysis* analysisPointer = getAnalysis(Analysis::DataflowGraphAnalysis);
    assert(analysisPointer != 0);

    analysis::DataflowGraph& graph =
        *static_cast<analysis::DataflowGraph*>(analysisPointer);

    // The pass requires the dataflow graph to already be in some SSA form.
    assert(graph.ssa() != analysis::DataflowGraph::SsaType::None);

    report(" Starting by scanning all basic blocks");

    // Seed the worklist with every basic block in the kernel.
    BlockSet worklist;
    for(iterator bb = graph.begin(); bb != graph.end(); ++bb) {
        report(" Queueing up BB_" << bb->id());
        worklist.insert(bb);
    }

    // Drain the worklist; elimination may push affected blocks back on it.
    while(!worklist.empty()) {
        iterator bb = *worklist.begin();
        worklist.erase(worklist.begin());
        eliminateDeadInstructions(graph, worklist, bb);
    }

    report("Finished running dead code elimination on kernel " << k.name);
    reportE(REPORT_PTX, k);
}
/// Releases every coalesced register whose live interval ends at or
/// before point p: its physical registers go back to the free pool, the
/// reverse register->variable mapping is dropped, and the variable is
/// removed from the on-register set.
void Allocator::clearExpired(const Interval::Point &p) {
    SpillPolicy::CoalescedSet::iterator it = _onRegister.begin();
    while(it != _onRegister.end()) {
        CoalescedRegister* reg = *it;

        // Still live at this point: keep it and move on.
        if(!(reg->interval.end <= p)) {
            ++it;
            continue;
        }

        reportE(DEBUG, "Point: "<< p << "; Coalesced register "
            << reg->reg() << " expired at point ("
            << reg->interval.end << ')');

        // Hand every physical register held by this variable back to
        // the free pool and drop the reverse mapping.
        while(!reg->allocated.empty()) {
            reportE(DEBUG_DETAILS, "\tPoint: "<< p
                << "; Coalesced register was using physical register"
                << (*reg->allocated.begin()));
            _available.insert(*reg->allocated.begin());
            _registerVariableMap.erase(*reg->allocated.begin());
            reg->allocated.erase(reg->allocated.begin());
        }

        // Erase-and-advance idiom for associative containers.
        _onRegister.erase(it++);
    }
}
/// Sets the number of physical registers available for allocation and
/// resets all allocation state.
///
/// Fix: the precondition is now checked BEFORE mutating state. The
/// original assigned _regs and called clear() first, so a zero count
/// could leave the allocator wiped with an unusable register budget
/// before the assertion fired.
void Allocator::setRegisters(const unsigned regs) {
    assertM(regs > 0, "No physical registers");
    _regs = regs;
    reportE(INFO, "Setting number of physical registers to " << _regs);
    clear();
}
void AffineLinearScan::_coalesce() { LinearScanRegisterAllocationPass::_coalesce(); RegisterCoalescedMap::iterator affReg = _ssa.begin(); while(affReg != _ssa.end()) { AffineRegister &ar = static_cast<AffineRegister &>(*affReg->second); reportE(DEBUG, "Coalesced: " << affReg->second); reportE(DEBUG, "Start: " << ar.state()); reportE(DEBUG, "In state: " << _afa().state(affReg->first)); ar.combineState(_afa().state(affReg->first)); reportE(DEBUG, "Out state: " << ar.state()); affReg++; } }
void DivergenceLinearScan::_coalesce() { LinearScanRegisterAllocationPass::_coalesce(); CoalescedRegisterMap::iterator divReg = _ssa.begin(); while(divReg != _ssa.end()) { DivergenceRegister &dr = static_cast<DivergenceRegister &>(*_coalesced[divReg->second]); reportE(DEBUG, "Coalesced: " << divReg->second); reportE(DEBUG, "Start: " << dr.state()); reportE(DEBUG, "In state: " << _diva().getDivergenceGraph().isDivNode(divReg->first)); dr.combineState(_diva().getDivergenceGraph().isDivNode(divReg->first)); reportE(DEBUG, "Out state: " << dr.state()); divReg++; } }
/// Spills variables in order of decreasing LRU cost until the set of
/// used variables fits within the physical register budget (_regs).
/// Updates both the used set and the running register demand in place.
void Allocator::selectByLRU(SpillPolicy::CoalescedSet &used,
    const Interval::Point &p, unsigned &totalRequired) {
    LRUSpillPolicy lru;
    LRUSpillPolicy::CoalescedCostMap rank = lru.rank(used, p);
    reportE(INFO, "Discarding variables by access distance");

    // Invert the ranking: key by cost so the most expensive candidate
    // (the one used farthest in the future) sits at the end.
    std::multimap<unsigned, CoalescedRegister*> byCost;
    for(LRUSpillPolicy::CoalescedCostMap::const_iterator entry = rank.begin();
        entry != rank.end(); ++entry) {
        byCost.insert(std::make_pair(entry->second, entry->first));
    }

    // Evict highest-cost variables until the demand fits.
    while(totalRequired > _regs) {
        assert(!used.empty());
        std::multimap<unsigned, CoalescedRegister*>::iterator victim =
            byCost.end();
        --victim;
        totalRequired -= victim->second->size();
        reportE(DEBUG, "Discarding variable " << victim->second->reg());
        victim->second->spill();
        used.erase(victim->second);
        byCost.erase(victim);
    }
}
/// Runs constant propagation over every basic block of the kernel.
/// Converts the dataflow graph to minimal SSA, seeds a worklist with all
/// blocks, then drains it; simplifying one block may re-queue others.
void ConstantPropagationPass::runOnKernel(ir::IRKernel& k) {
    report("Running constant propagation on kernel " << k.name);

    Analysis* analysisPointer = getAnalysis("DataflowGraphAnalysis");
    assert(analysisPointer != 0);

    analysis::DataflowGraph& graph =
        *static_cast<analysis::DataflowGraph*>(analysisPointer);

    // Constant propagation operates on minimal SSA form.
    graph.convertToSSAType(analysis::DataflowGraph::Minimal);
    assert(graph.ssa() == analysis::DataflowGraph::Minimal);

    report(" Starting by scanning all basic blocks");

    // Seed the worklist with every basic block in the kernel.
    BlockSet worklist;
    for(iterator bb = graph.begin(); bb != graph.end(); ++bb) {
        report(" Queueing up BB_" << bb->id());
        worklist.insert(bb);
    }

    // Drain the worklist; simplification may push blocks back on it.
    while(!worklist.empty()) {
        iterator bb = *worklist.begin();
        worklist.erase(worklist.begin());
        eliminateRedundantInstructions(graph, worklist, bb);
    }

    report("Finished running constant propagation on kernel " << k.name);
    reportE(REPORT_PTX, k);
}
/* Analyzes the kernel's dataflow graph and builds the divergence graph:
 * phi instructions and ordinary instructions contribute source->destination
 * dependence edges, and destinations of atomics, calls, local-memory
 * accesses, and loads of function stack arguments are marked directly as
 * divergence sources. Finally, divergence is propagated through the graph. */
void DivergenceAnalysis::_analyzeDataFlow() {
    Analysis* dfg = getAnalysis("DataflowGraphAnalysis");
    assert(dfg != 0);

    DataflowGraph &nonConstGraph = static_cast<DataflowGraph&>(*dfg);
    DataflowGraph::const_iterator block = nonConstGraph.begin();
    DataflowGraph::const_iterator endBlock = nonConstGraph.end();

    report("Analyzing data flow");

    /* 1) Analyze the data flow adding divergence sources */
    for (; block != endBlock; ++block) {
        report(" for block " << block->label());

        DataflowGraph::PhiInstructionVector::const_iterator
            phiInstruction = block->phis().begin();
        DataflowGraph::PhiInstructionVector::const_iterator
            endPhiInstruction = block->phis().end();

        /* Go over the phi functions and add their dependences to the
         * dependence graph. */
        for (; phiInstruction != endPhiInstruction; phiInstruction++) {
            for (DataflowGraph::RegisterVector::const_iterator
                si = phiInstruction->s.begin();
                si != phiInstruction->s.end(); ++si) {
                _divergGraph.insertEdge(si->id, phiInstruction->d.id);
                report(" phi r" << phiInstruction->d.id
                    << " <- r" << si->id);
            }
        }

        DataflowGraph::InstructionVector::const_iterator
            ii = block->instructions().begin();
        DataflowGraph::InstructionVector::const_iterator
            iiEnd = block->instructions().end();
        for (; ii != iiEnd; ++ii) {
            ir::PTXInstruction *ptxInstruction = NULL;
            bool atom = false;                  // destination of an atomic op
            bool functionStackArgument = false; // operand is a call argument
            bool localMemoryOperand = false;    // operand uses local memory
            bool isCall = false;                // instruction is a call

            std::set<const ir::PTXOperand*> divergenceSources;

            /* First we populate divergenceSources with all the
             * source operands that might diverge. */
            if (typeid(ir::PTXInstruction) == typeid(*(ii->i))) {
                ptxInstruction = static_cast<ir::PTXInstruction*> (ii->i);

                if (isDivergenceSource(ptxInstruction->a)) {
                    divergenceSources.insert(&ptxInstruction->a);
                }
                if (isDivergenceSource(ptxInstruction->b)) {
                    divergenceSources.insert(&ptxInstruction->b);
                }
                if (isDivergenceSource(ptxInstruction->c)) {
                    divergenceSources.insert(&ptxInstruction->c);
                }

                if (ptxInstruction->opcode == ir::PTXInstruction::Atom){
                    atom = true;
                }

                if (ptxInstruction->mayHaveAddressableOperand()) {
                    if (_doesOperandUseLocalMemory(ptxInstruction->a)) {
                        localMemoryOperand = true;
                    }
                }

                if (ptxInstruction->opcode == ir::PTXInstruction::Call){
                    isCall = true;
                }
            }

            /* Second, if this is a function call, we populate
             * divergenceSources with all the source operands that might
             * diverge in a call. */
            if (_kernel->function()) {
                if (typeid(ir::PTXInstruction) == typeid(*(ii->i))) {
                    ptxInstruction =
                        static_cast<ir::PTXInstruction*> (ii->i);

                    if (ptxInstruction->mayHaveAddressableOperand()) {
                        if (_isOperandAnArgument(ptxInstruction->a)) {
                            functionStackArgument = true;
                            report(" operand '"
                                << ptxInstruction->a.toString()
                                << "' is a function call argument.");
                        }
                    }
                }
            }

            /* Third, we link the source operands to the
             * destination operands, and check if the destination
             * can diverge. This will only happen in case the
             * instruction is atomic. */
            DataflowGraph::RegisterPointerVector::const_iterator
                destinationReg = ii->d.begin();
            DataflowGraph::RegisterPointerVector::const_iterator
                destinationEndReg = ii->d.end();

            for (; destinationReg != destinationEndReg; destinationReg++) {
                if (divergenceSources.size() != 0) {
                    std::set<const ir::PTXOperand*>::iterator
                        divergenceSource = divergenceSources.begin();
                    std::set<const ir::PTXOperand*>::iterator
                        divergenceSourceEnd = divergenceSources.end();

                    for (; divergenceSource != divergenceSourceEnd;
                        divergenceSource++) {
                        /* NOTE(review): this report streams the operand's
                         * pointer value after "r", not a register id —
                         * confirm whether a field of *divergenceSource
                         * was intended. */
                        report(" destination register r"
                            << *destinationReg->pointer
                            << " is derived from a divergence source r"
                            << *divergenceSource);
                        _divergGraph.insertEdge(*divergenceSource,
                            *destinationReg->pointer);
                    }
                }

                // Every source register feeds every destination register.
                DataflowGraph::RegisterPointerVector::const_iterator
                    sourceReg = ii->s.begin();
                DataflowGraph::RegisterPointerVector::const_iterator
                    sourceRegEnd = ii->s.end();

                for (; sourceReg != sourceRegEnd; sourceReg++) {
                    _divergGraph.insertEdge(*sourceReg->pointer,
                        *destinationReg->pointer);
                    reportE(REPORT_ALL_DEPENDENCES,
                        " r" << *destinationReg->pointer
                        << " <- r" << *sourceReg->pointer);
                }

                // Atomics, calls, local-memory accesses and stack-argument
                // uses make the destination itself a divergence source.
                if (atom || functionStackArgument
                    || localMemoryOperand || isCall) {
                    report(" destination register r"
                        << *destinationReg->pointer
                        << " is a divergence source.");
                    _divergGraph.insertNode(*destinationReg->pointer);
                    _divergGraph.setAsDiv(*destinationReg->pointer);
                }
            }
        }
    }
    /* 2) Computes the divergence propagation */
    _divergGraph.computeDivergence();
}
const SpillPolicy::CoalescedSet & Allocator::use(SpillPolicy::CoalescedSet used, const Interval::Point &p, const bool ignoreSpilled, const bool spillUsed, const bool coalesced, const bool removeSpilled) { assert((_available.size() + _registerVariableMap.size()) == _regs); reportE(INFO, "Point: "<< p << "; Variables count: " << used.size()); /* coalesced tells that all used variables must be placed on a coalesced amount of registers, * TODO: This is meant for vector variables, check if it is really required, and if it require * consequent registers * TODO: Spill/store/load vector variables all together using vector instructions */ assertM(!(coalesced && (spillUsed || ignoreSpilled)), "Can't be coalesced when ignoring spilled"); clearExpired(p); if(used.empty()){ reportE(INFO, "No variables used"); return _onRegister; } SpillPolicy::CoalescedSet::iterator crI = used.begin(); SpillPolicy::CoalescedSet::iterator crIEnd = used.end(); unsigned totalRegisters = 0; reportE(INFO, "Counting required registers"); while(crI != crIEnd) { CoalescedRegister *cr = *crI; reportE(DEBUG, "Variable: " << cr->reg() << "; Size: " << cr->size()); /* If spilled variables must be ignored, they are not allocated and removed from on register status */ if(ignoreSpilled && cr->spilled()) { reportE(DEBUG, "\tNot allocating spilled variable: " << cr->reg()); if(removeSpilled && (_onRegister.find(cr) != _onRegister.end())) { reportE(DEBUG, "\t\tSpilled variable " << cr->reg() << " marked as on-register, removing it"); while(cr->allocated.size() > 0) { reportE(DEBUG_DETAILS, "\t\t\tFreeing physical register " << *(cr->allocated.begin())); _available.insert(*(cr->allocated.begin())); _registerVariableMap.erase(_registerVariableMap.find(*(cr->allocated.begin()))); cr->allocated.erase(cr->allocated.begin()); } _onRegister.erase(cr); } SpillPolicy::CoalescedSet::iterator erase = crI; crI++; used.erase(erase); continue; } totalRegisters += cr->size(); crI++; } if(used.empty()) { reportE(DEBUG, "Nothing 
to do, all variables are spilled"); return _onRegister; } /* If we can't spill variables on the used list, they must use at most the same amount of registers * as available */ assertM((totalRegisters <= _regs) || spillUsed, "Variables larger than available registers"); if(totalRegisters > _regs){ selectByLRU(used, p, totalRegisters); } SpillPolicy::CoalescedSet worklist = used; /* If the variables need to be coalesced, then: * 1) All variables need to be the same size * 2) The first position will be aligned based on its size * This is expecting that vector require coalesced aligned registers */ reportE(INFO, "Required registers: " << totalRegisters); if(coalesced) {//TODO: Implement coalesced allocating, if required for vectors reportE(INFO, "Coalesced allocation"); assertM(false, "TODO: Implement coalesced allocating, required by vectors"); } SpillPolicy::CoalescedSet::iterator variable = worklist.begin(); while(variable != worklist.end()){ if(_onRegister.find(*variable) != _onRegister.end()){ reportE(DEBUG, "\tVariable " << (*variable)->reg() << " already on register, erasing it from worklist"); SpillPolicy::CoalescedSet::iterator erase = variable; variable++; worklist.erase(erase); continue; } variable++; } /* Must fit larger variables first --> map variables by registers size */ std::multimap<ushort, CoalescedRegister*> sizeCoalescedMap; for(SpillPolicy::CoalescedSet::const_iterator cr = worklist.begin(); cr != worklist.end(); cr++) { reportE(DEBUG_DETAILS, "\tVariable " << (*cr)->reg() << "; Size: " << (*cr)->size()); sizeCoalescedMap.insert(std::make_pair((*cr)->size(), *cr)); } reportE(INFO, "Start testing if all fits on current available registers"); /* Try to allocate variables by largest size first. 
If one won't fit, start spilling */ while(worklist.size() > 0) { bool fit = true; std::multimap<ushort, CoalescedRegister*>::iterator crI = --sizeCoalescedMap.end(); CoalescedRegister *cr = crI->second; reportE(DEBUG, "Test variable " << cr->reg() << " of size " << cr->size()); unsigned int maxReg = _regs - cr->size(); maxReg -= (maxReg % cr->size()); for(unsigned i = 0; i <= maxReg; i += cr->size()) { fit = true; reportE(DEBUG_DETAILS, "Starting position " << i); for(unsigned u = i; fit && (u < (i + cr->size())); u++) { fit &= (_available.find(u) != _available.end()); reportE(DEBUG_DETAILS, "Position " << u << " is busy?" << !fit); } if(fit) { reportE(DEBUG, "Allocating " << cr->reg() << " on position " << i); for(unsigned u = i; u < (i + cr->size()); u++) { cr->allocated.insert(u); assert(_available.erase(u)); _registerVariableMap[u] = cr; } sizeCoalescedMap.erase(crI); _onRegister.insert(cr); worklist.erase(cr); break; } } if(!fit){ break; } } if(worklist.empty()) { reportE(INFO, "All variable allocated without spilling"); return _onRegister; } /* Create variable spill cost ranking, based on active spill policies and weights */ PolicyMap::const_iterator policy = _policies.begin(); PolicyMap::const_iterator policyEnd = _policies.end(); RegisterCostMap scoresSum; for(SpillPolicy::RegisterId i = 0; i < _regs; i++) { scoresSum[i] = 0; } reportE(INFO, "Spilling is required"); for(; policy != policyEnd; policy++) { reportE(DEBUG, "Calculating spill cost based on spill policy: " << policy->first); SpillPolicy::CoalescedCostMap score; score = policy->second->rank(_onRegister, p); while(score.size() > 0) { CoalescedRegister* cr = score.begin()->first; assertM(cr->allocated.size() > 0, "Variable marked as on register and without allocated registers"); SpillPolicy::RegisterId reg = *cr->allocated.begin(); if(used.find(cr) != used.end()){ scoresSum[reg] = std::numeric_limits<unsigned>::max(); reportE(DEBUG_DETAILS, "Variable " << cr->reg() << " on position " << reg << " is 
used, setting cost to maximum"); } else { scoresSum[reg] += (_weights[policy->first] * score.begin()->second); reportE(DEBUG_DETAILS, "\tVariable " << cr->reg() << " on position " << reg << " has cost " << scoresSum[reg]); } score.erase(score.begin()); } } reportE(INFO, "Locating best locations based on spill costs"); /* Find best spilling position, starting by largest variables */ while(worklist.size() > 0) { std::multimap<ushort, CoalescedRegister*>::iterator crI = --sizeCoalescedMap.end(); CoalescedRegister *cr = crI->second; long int best = std::numeric_limits<long int>::max(); SpillPolicy::RegisterId bestStart = _regs; reportE(DEBUG, "\tAllocating variable " << cr->reg() << "; Size: " << cr->size()); unsigned int maxReg = _regs - cr->size(); maxReg -= (maxReg % cr->size()); for(SpillPolicy::RegisterId i = 0; i <= maxReg; i += cr->size()) { bool noUsedVariable = true; long int cost = 0; CoalescedRegister *last = NULL; reportE(DEBUG, "\t\tTesting start position:" << i); for(unsigned u = i; noUsedVariable && (u < (i + cr->size())); u++) { if(_registerVariableMap.find(u) == _registerVariableMap.end()){ reportE(DEBUG_DETAILS, "\t\t\tNo variable starts on position " << u); continue; } if(last == _registerVariableMap[u]){ reportE(DEBUG_DETAILS, "\t\t\tPosition " << u << " is part of already accounted variable " << last->reg()); continue; } last = _registerVariableMap[u]; /* Assure that the variable using register is not being used now */ reportE(DEBUG_DETAILS, "\t\t\tPosition " << u << " is beginning of variable " << last->reg()); noUsedVariable &= (used.find(last) == used.end()); if(noUsedVariable){ reportE(DEBUG_DETAILS, "\t\t\tVariable " << last->reg() << " is not required to be allocated."); } else { reportE(DEBUG_DETAILS, "\t\t\tVariable " << last->reg() << " is required to be allocated."); } cost += scoresSum[*last->allocated.begin()]; reportE(DEBUG, "\t\tCost to insert at point " << i << ": " << cost); } if(!noUsedVariable) { reportE(DEBUG, "\t\tCan't use 
position " << i << ", variable required to be on register"); continue; } if(cost < best) { best = cost; bestStart = i; reportE(INFO, "\t\tNew best position for variable " << cr->reg() << ": " << i); } } assertM(bestStart != _regs, "Error finding a insertion position"); reportE(INFO, "\tAllocating at position " << bestStart); for(unsigned u = bestStart; u < (bestStart + cr->size()); u++) { if(_registerVariableMap.find(u) == _registerVariableMap.end()) { reportE(DEBUG_DETAILS, "\tNo variable at position " << u); continue; } CoalescedRegister *spillCr = _registerVariableMap[u]; if(!spillCr->spilled()) { reportE(DEBUG, "\tSpilling variable " << spillCr->reg()); spillCr->spill(); } _onRegister.erase(spillCr); while(spillCr->allocated.size() > 0) { reportE(DEBUG_DETAILS, "\tFreeing physical register " << *(spillCr->allocated.begin())); _available.insert(*(spillCr->allocated.begin())); _registerVariableMap.erase( _registerVariableMap.find(*(spillCr->allocated.begin()))); spillCr->allocated.erase(spillCr->allocated.begin()); } } _onRegister.insert(cr); for(unsigned u = bestStart; u < (bestStart + (cr->size())); u++) { reportE(DEBUG, "\tAllocating register " << u << " to variable " << cr->reg()); cr->allocated.insert(u); _registerVariableMap[u] = cr; assertM(_available.erase(u), "Register being allocated not marked as available"); } worklist.erase(cr); sizeCoalescedMap.erase(crI); ; } assertM(sizeCoalescedMap.empty(), "Size mapped worklist not clear"); return _onRegister; }
/// Ensures the single coalesced register cr occupies physical registers
/// at point p. If an aligned hole is free, it is used directly; otherwise
/// the cheapest window of currently-resident variables (per the active
/// spill policies and weights) is spilled to make room.
///
/// Returns the set of variables currently resident in registers.
///
/// Fix: the original executed _available.erase(u) INSIDE assertM() when
/// claiming registers; under NDEBUG the erase would be compiled out,
/// corrupting the free-register set. The erase now runs unconditionally.
const SpillPolicy::CoalescedSet & Allocator::use(CoalescedRegister *cr,
    const Interval::Point &p) {
    assert((_available.size() + _registerVariableMap.size()) == _regs);
    clearExpired(p);
    reportE(INFO, "Point: "<< p << "; Variable " << cr->reg()
        << "; Size: " << cr->size());

    /* If the variable cr is marked as on register, there is nothing to do */
    if(_onRegister.find(cr) != _onRegister.end()) {
        reportE(DEBUG, "Point: "<< p << "; Variable " << cr->reg()
            << " already marked as onRegister, nothing to do");
        return _onRegister;
    }

    /* There can't be a variable larger than the available registers */
    assertM(cr->size() <= _regs, "Variable larger than available registers");

    /* Test if the variable fits on available holes. We assume the variable
     * must be placed on a register position aligned to it's size */
    //TODO: Discover if vectors require aligned positions and how may affect here
    unsigned int maxReg = _regs - cr->size();
    maxReg -= (maxReg % cr->size());

    for(unsigned i = 0; i <= maxReg; i += cr->size()) {
        bool fit = true;
        for(unsigned u = i; fit && (u < (i + cr->size())); u++) {
            fit &= _available.find(u) != _available.end();
        }
        if(fit) {
            reportE(DEBUG, "Point: "<< p << "; Variable " << cr->reg()
                << ", with size " << cr->size()
                << " fits on starting register " << i);
            for(unsigned u = i; fit && (u < (i + cr->size())); u++) {
                reportE(DEBUG_DETAILS, "\tPoint: "<< p << "; Variable "
                    << cr->reg() << " receives physical register " << u);
                cr->allocated.insert(u);
                _available.erase(u);
                _registerVariableMap[u] = cr;
            }
            _onRegister.insert(cr);
            return _onRegister;
        }
    }

    /* Create a cost ranking based on all spill policies active, and
     * weight of each one of them */
    PolicyMap::const_iterator policy = _policies.begin();
    PolicyMap::const_iterator policyEnd = _policies.end();
    RegisterCostMap scoresSum;
    for(SpillPolicy::RegisterId i = 0; i < _regs; i++) {
        scoresSum[i] = 0;
    }

    for(; policy != policyEnd; policy++) {
        reportE(DEBUG, "Point: "<< p
            << "; Building spilling cost score for spill policy"
            << policy->first);
        SpillPolicy::CoalescedCostMap score;
        score = policy->second->rank(_onRegister, p);
        while(score.size() > 0) {
            assertM(score.begin()->first->allocated.size() > 0,
                "No allocated physical registers");
            SpillPolicy::RegisterId reg =
                *score.begin()->first->allocated.begin();
            reportE(DEBUG_DETAILS, "\tPoint: "<< p << "; Variable "
                << score.begin()->first->reg() << ", at register " << reg
                << " receives cost " << score.begin()->second);
            scoresSum[reg] += (_weights[policy->first]
                * score.begin()->second);
            reportE(DEBUG_DETAILS, "\tPoint: "<< p << "; Register" << reg
                << " has total cost " << scoresSum[reg]);
            score.erase(score.begin());
        }
    }

    /* Locate the best insertion point */
    long int best = std::numeric_limits<long int>::max();
    SpillPolicy::RegisterId bestStart = _regs;
    for(SpillPolicy::RegisterId i = 0; i <= maxReg; i += cr->size()) {
        long int cost = 0;
        CoalescedRegister* last = NULL;
        // Sum the spill cost of each distinct variable overlapping the
        // candidate window [i, i + size).
        for(unsigned u = i; (u < (i + cr->size())); u++) {
            if(_registerVariableMap.find(u) == _registerVariableMap.end())
                continue;
            if(last == _registerVariableMap[u]) continue;
            last = _registerVariableMap[u];
            cost += scoresSum[*last->allocated.begin()];
        }
        reportE(DEBUG_DETAILS, "\tPoint: "<< p << "; Spilling from position "
            << i << ", a size of " << cr->size()
            << " registers has a cost of " << cost);
        if(cost < best) {
            best = cost;
            bestStart = i;
            reportE(DEBUG_DETAILS, "\tPoint: "<< p
                << "; Is best spilling cost so far" );
        }
    }
    assertM(bestStart < _regs, "Error finding best insert position");

    /* Spill required variables */
    reportE(DEBUG, "Spilling from start position " << bestStart );
    for(unsigned u = bestStart; (u < (bestStart + cr->size())); u++) {
        reportE(DEBUG_DETAILS, "\tRegister "<< u << ":" );
        if(_registerVariableMap.find(u) == _registerVariableMap.end()) {
            assert(_available.find(u) != _available.end());
            continue;
        }
        CoalescedRegister *spillCr = _registerVariableMap[u];
        reportE(DEBUG_DETAILS, "\t\tContains variable "<< spillCr->reg()
            << " of size " << spillCr->size() );
        if(!spillCr->spilled()) {
            reportE(DEBUG_DETAILS, "\t\t\tNew spill" );
            spillCr->spill();
        }
        assertM((*(spillCr->allocated.begin())) == u, "Wrong mapping");
        // Return every physical register held by the victim to the pool.
        while(spillCr->allocated.size() > 0) {
            SpillPolicy::RegisterId a = *(spillCr->allocated.begin());
            _available.insert(a);
            reportE(DEBUG_DETAILS, "\t\t\tFreeing register " << a );
            _registerVariableMap.erase(a);
            spillCr->allocated.erase(a);
        }
        _onRegister.erase(spillCr);
    }

    reportE(DEBUG, "Associating registers to variable " << cr->reg() );

    /* Allocate registers to the new variable */
    _onRegister.insert(cr);
    for(unsigned u = bestStart; (u < (bestStart + cr->size())); u++) {
        cr->allocated.insert(u);
        // Keep the erase outside the assertion so it also happens in
        // release builds.
        const bool removed = (_available.erase(u) != 0);
        assertM(removed, "Register being allocated not marked as available");
        _registerVariableMap[u] = cr;
    }
    return _onRegister;
}
/* Rewrites parameter traffic for an inlined call: stores to call
 * parameters become moves/casts into the associated argument registers,
 * and loads from call parameters become moves/casts out of them. Stores
 * and loads that target bit-bucket (discarded) arguments are deleted. */
static void convertParametersToRegisters(
    const BasicBlockMap& newBlocks,
    ir::IRKernel& kernel, ir::ControlFlowGraph::instruction_iterator
    callIterator, const ir::IRKernel& calledKernel) {
    typedef std::unordered_map<std::string, ir::PTXOperand> OperandMap;
    typedef std::unordered_set<std::string> StringSet;

    reportE(REPORT_DETAILS, " Converting parameters to registers...");

    // Get a map from argument name to register in the calling function
    OperandMap argumentMap;
    StringSet bitBucketArguments;

    auto argument = calledKernel.arguments.begin();

    ir::PTXInstruction& call = static_cast<ir::PTXInstruction&>(**callIterator);

    // call.d carries the return-argument operands (returnArgument asserted).
    for(auto parameter = call.d.array.begin();
        parameter != call.d.array.end(); ++parameter, ++argument) {
        if(parameter->addressMode == ir::PTXOperand::BitBucket) {
            // NOTE(review): argument->name is read before the end()
            // assertion below — confirm the iterator is always valid
            // on this path.
            bitBucketArguments.insert(argument->name);
            continue;
        }

        assert(argument != calledKernel.arguments.end());
        assert(parameter->addressMode == ir::PTXOperand::Register ||
            parameter->addressMode == ir::PTXOperand::Immediate);
        assert(argumentMap.count(argument->name) == 0);
        assert(argument->returnArgument);

        argumentMap.insert(std::make_pair(argument->name, *parameter));
    }

    // call.b carries the input-argument operands (!returnArgument asserted).
    for(auto parameter = call.b.array.begin();
        parameter != call.b.array.end(); ++parameter, ++argument) {
        if(parameter->addressMode == ir::PTXOperand::BitBucket) {
            bitBucketArguments.insert(argument->name);
            continue;
        }

        assert(argument != calledKernel.arguments.end());
        assert(parameter->addressMode == ir::PTXOperand::Register ||
            parameter->addressMode == ir::PTXOperand::Immediate);
        assert(argumentMap.count(argument->name) == 0);
        assert(!argument->returnArgument);

        argumentMap.insert(std::make_pair(argument->name, *parameter));
    }

    // Convert all stores to that parameter to moves to the associated register
    for(auto block = newBlocks.begin(); block != newBlocks.end(); ++block) {
        for(auto instruction = block->second->instructions.begin();
            instruction != block->second->instructions.end(); ++instruction) {
            ir::PTXInstruction& ptx = static_cast<ir::PTXInstruction&>(
                **instruction);

            if(ptx.opcode != ir::PTXInstruction::St) continue;
            if(ptx.addressSpace != ir::PTXInstruction::Param) continue;
            if(ptx.d.addressMode != ir::PTXOperand::Address) continue;

            if(bitBucketArguments.count(ptx.d.identifier)) {
                // Stores into discarded arguments are simply removed.
                // NOTE(review): --erase(it) is undefined if the erased
                // element is the first in the container — confirm a param
                // store can never be the first instruction of a block.
                delete *instruction;
                instruction = --block->second->instructions.erase(instruction);
                continue;
            }

            auto argument = argumentMap.find(ptx.d.identifier);

            if(argument == argumentMap.end()) continue;

            ptx.type = argument->second.type;
            ptx.pg = call.pg;
            ptx.d = argument->second;

            if(argument->second.addressMode == ir::PTXOperand::Register) {
                // If the types match, it is a move
                // NOTE(review): ptx.d was just assigned from
                // argument->second, so this comparison is always true and
                // the Cvt branch below is unreachable — possibly the
                // comparison was meant against ptx.a.type.
                if(argument->second.type == ptx.d.type) {
                    ptx.opcode = ir::PTXInstruction::Mov;
                }
                else {
                    // otherwise, we need a cast
                    ptx.opcode = ir::PTXInstruction::Cvt;
                    ptx.modifier = ir::PTXInstruction::Modifier_invalid;
                }
            }
            else {
                assert(argument->second.addressMode ==
                    ir::PTXOperand::Immediate);
                ptx.opcode = ir::PTXInstruction::Mov;
            }
        }
    }

    // Convert all loads from that parameter to moves from the register
    for(auto block = newBlocks.begin(); block != newBlocks.end(); ++block) {
        for(auto instruction = block->second->instructions.begin();
            instruction != block->second->instructions.end(); ++instruction) {
            ir::PTXInstruction& ptx = static_cast<ir::PTXInstruction&>(
                **instruction);

            if(ptx.opcode != ir::PTXInstruction::Ld) continue;
            if(ptx.addressSpace != ir::PTXInstruction::Param) continue;
            if(ptx.a.addressMode != ir::PTXOperand::Address) continue;

            if(bitBucketArguments.count(ptx.a.identifier)) {
                // Loads from discarded arguments are simply removed.
                delete *instruction;
                instruction = --block->second->instructions.erase(instruction);
                continue;
            }

            auto argument = argumentMap.find(ptx.a.identifier);

            if(argument == argumentMap.end()) continue;

            assert(ptx.d.addressMode == ir::PTXOperand::Register);

            ptx.type = argument->second.type;
            ptx.pg = call.pg;
            ptx.a = argument->second;

            // If the types match, it is a move
            // NOTE(review): ptx.type and ptx.a.type were both just set
            // from argument->second, so this is always true and the Cvt
            // branch is unreachable — confirm intended comparison.
            if(ptx.type == ptx.a.type) {
                ptx.opcode = ir::PTXInstruction::Mov;
            }
            else {
                // otherwise, we need a cast
                ptx.opcode = ir::PTXInstruction::Cvt;
                ptx.modifier = ir::PTXInstruction::Modifier_invalid;
            }
        }
    }
}
/* Extends the kernel stack with a per-warp shared-memory region and
 * emits, at the top of the entry block, the instruction sequence that
 * computes this warp's base offset into that region:
 *   warpid = (ntid.x * (ntid.y * tid.z + tid.y) + tid.x) >> 5
 *   offset = warpid * bytesPerWarp
 * The resulting address is moved into the warp-position register. */
void AffineLinearScan::_extendStack() {
    _shared.declaration(_kernel->locals, MAX_WARPS);
    reportE(INFO, "Kernel " << _kernel->name << " requires "
        << _shared.bytes() << " bytes of shared memory per warp, total of "
        << MAX_WARPS * _shared.bytes() << '(' << MAX_WARPS << " warps)");
    LinearScanRegisterAllocationPass::_extendStack();
    reportE(DEBUG, "Writing warp local memory stack access information");
    // Nothing to emit when no shared spill space is needed.
    if(_shared.bytes() == 0) return;

    /* warpid = (size_x * ( size_y * z + y ) + x) >> 5
     * a = size_y
     * b = z
     * c = y
     * a = mad a z c
     * b = size_x
     * c = x
     * a = mad a b c
     * a = shr a 5 (>>5 == /32)
     * memPosition = memInitialPosition [ warpid * bytesPerWarp ] */
    analysis::DataflowGraph::iterator block = _dfg().begin();
    // Three scratch registers for the warp-id computation.
    RegisterId a, b, c;

    /* Use a AffineRegister temporary register of type u32 if available */
    if(AffineRegister::tempRegisters.count(
        ir::PTXOperand::DataType::u32) != 0) {
        a = AffineRegister::tempRegisters[ir::PTXOperand::DataType::u32];
    } else {
        a = _dfg().newRegister();
    }
    b = _dfg().newRegister();
    /* If memory size is 32 bits, can use warpPosition variable as temporary */
    if(_m->addressSize() == 32) {
        c = AffineRegister::warpPosition;
    } else {
        c = _dfg().newRegister();
    }

    // Instructions are inserted at fixed positions 0..9 at the top of
    // the entry block, in dependence order.

    // size_y = %ntid.y
    ir::PTXInstruction sizeY(ir::PTXInstruction::Mov);
    sizeY.d = ir::PTXOperand(ir::PTXOperand::Register,
        ir::PTXOperand::DataType::u32, a);
    sizeY.a = ir::PTXOperand(ir::PTXOperand::ntid,
        ir::PTXOperand::iy, ir::PTXOperand::u32);
    sizeY.type = ir::PTXOperand::DataType::u32;
    _dfg().insert(block, sizeY, 0);

    // z = %tid.z
    ir::PTXInstruction z(ir::PTXInstruction::Mov);
    z.d = ir::PTXOperand(ir::PTXOperand::Register,
        ir::PTXOperand::DataType::u32, b);
    z.a = ir::PTXOperand(ir::PTXOperand::tid,
        ir::PTXOperand::iz, ir::PTXOperand::u32);
    z.type = ir::PTXOperand::DataType::u32;
    _dfg().insert(block, z, 1);

    // y = %tid.y
    ir::PTXInstruction y(ir::PTXInstruction::Mov);
    y.d = ir::PTXOperand(ir::PTXOperand::Register,
        ir::PTXOperand::DataType::u32, c);
    y.a = ir::PTXOperand(ir::PTXOperand::tid,
        ir::PTXOperand::iy, ir::PTXOperand::u32);
    y.type = ir::PTXOperand::DataType::u32;
    _dfg().insert(block, y, 2);

    // a = size_y * z + y
    ir::PTXInstruction mad1(ir::PTXInstruction::Mad);
    mad1.d = sizeY.d;
    mad1.a = sizeY.d;
    mad1.b = z.d;
    mad1.c = y.d;
    mad1.type = ir::PTXOperand::DataType::u32;
    mad1.modifier = ir::PTXInstruction::Modifier::lo;
    _dfg().insert(block, mad1, 3);

    // size_x = %ntid.x (reuses register b)
    ir::PTXInstruction sizeX(ir::PTXInstruction::Mov);
    sizeX.d = z.d;
    sizeX.a = ir::PTXOperand(ir::PTXOperand::ntid,
        ir::PTXOperand::ix, ir::PTXOperand::u32);
    sizeX.type = ir::PTXOperand::DataType::u32;
    _dfg().insert(block, sizeX, 4);

    // x = %tid.x (reuses register c)
    ir::PTXInstruction x(ir::PTXInstruction::Mov);
    x.d = y.d;
    x.a = ir::PTXOperand(ir::PTXOperand::tid,
        ir::PTXOperand::ix, ir::PTXOperand::u32);
    x.type = ir::PTXOperand::DataType::u32;
    _dfg().insert(block, x, 5);

    // 1) warpid = size_x * size_y
    ir::PTXInstruction mad2(ir::PTXInstruction::Mad);
    mad2.d = mad1.d;
    mad2.a = mad1.d;
    mad2.b = sizeX.d;
    mad2.c = x.d;
    mad2.type = ir::PTXOperand::DataType::u32;
    mad2.modifier = ir::PTXInstruction::Modifier::lo;
    _dfg().insert(block, mad2, 6);

    // 5) warpid = [size_x * y + size_x * size_y * z + x] >> 5
    ir::PTXInstruction shr(ir::PTXInstruction::Shr);
    shr.d = mad2.d;
    shr.a = mad2.d;
    shr.b = ir::PTXOperand(5);
    shr.type = ir::PTXOperand::DataType::u32;
    _dfg().insert(block, shr, 7);

    // 6) position = warpid * stride
    ir::PTXInstruction position(ir::PTXInstruction::Mul);
    position.d = shr.d;
    position.a = shr.d;
    position.b = ir::PTXOperand(_shared.bytes());
    position.type = ir::PTXOperand::DataType::u32;
    position.modifier = ir::PTXInstruction::Modifier::lo;
    _dfg().insert(block, position, 8);

    //%memoryStart = stack name;
    ir::PTXInstruction memoryStart(ir::PTXInstruction::Mov);
    memoryStart.a = ir::PTXOperand(_shared.name() + "["
        + position.d.toString() + "]");
    if(_m->addressSize() == 32) {
        memoryStart.d = x.d;
        memoryStart.type = ir::PTXOperand::DataType::u32;
    } else {
        // 64-bit addressing: land the address in a fresh u64 register.
        memoryStart.d = ir::PTXOperand(ir::PTXOperand::Register,
            ir::PTXOperand::DataType::u64, AffineRegister::warpPosition);
        memoryStart.type = ir::PTXOperand::DataType::u64;
    }
    _dfg().insert(block, memoryStart, 9);
}
/// Tears down per-kernel state after the affine linear scan completes.
void AffineLinearScan::finalize() {
    _clear();
    reportE(DEBUG, "Finalizing affine linear scan");
}
/// Prepares the affine linear scan to run over module m; only records
/// the module pointer for later queries (e.g. address size).
void AffineLinearScan::initialize(const ir::Module& m) {
    reportE(DEBUG, "Running affine linear scan");
    _m = &m;
}
/// Tears down per-kernel state after the divergence linear scan completes.
void DivergenceLinearScan::finalize() {
    _clear();
    reportE(DEBUG, "Finalizing divergence linear scan");
}
/// Prepares the divergence linear scan to run over module m; only
/// records the module pointer for later queries.
void DivergenceLinearScan::initialize(const ir::Module& m) {
    reportE(DEBUG, "Running divergence linear scan");
    _m = &m;
}