static BlockSet getBlocksWithCallsToFuctionsThatObserveSideEffects(
	ir::IRKernel& k)
{
	BlockSet blocks;

	report(" Getting functions that can observe side-effects");
	
	for(auto block = k.cfg()->begin(); block != k.cfg()->end(); ++block)
	{
		for(auto instruction : block->instructions)
		{
			auto ptxInstruction = static_cast<ir::PTXInstruction*>(instruction);
		
			// TODO: Check that the target can observe side effects
			if(ptxInstruction->isCall())
			{
				report("  " << ptxInstruction->toString());

				blocks.insert(block);
				break;
			}
		}
	}
	
	return blocks;
}
ir::ControlFlowGraph::iterator 
	HoistParameterLoadsPass::_getTopLevelDominatingBlock(
		ir::IRKernel& k, ir::ControlFlowGraph::iterator block)
{
	auto loopAnalysis = static_cast<analysis::LoopAnalysis*>(
		getAnalysis(Analysis::LoopAnalysis));
	auto dominatorTree = static_cast<analysis::DominatorTree*>(
		getAnalysis(Analysis::DominatorTreeAnalysis));
		
	while(loopAnalysis->isContainedInLoop(block))
	{
		auto dominator = dominatorTree->getDominator(block);
		
		if(dominator == block->cfg->get_entry_block())
		{
			block = k.cfg()->split_edge(dominator->get_fallthrough_edge(),
				ir::BasicBlock(k.cfg()->newId())).first->tail;
				
			invalidateAnalysis(analysis::Analysis::LoopAnalysis         );
			invalidateAnalysis(analysis::Analysis::DominatorTreeAnalysis);

			break;
		}
		
		block = dominator;
	}
	
	return block;
	
}
void SimplifyControlFlowGraphPass::runOnKernel(ir::IRKernel& k)
{
	bool changed = true;
	
	report("Simplify control flow for " << k.name);
	
	while(changed)
	{
		changed  = _deleteUnconnectedBlocks(k);
		changed |= _deleteEmptyBlocks(k);
		
		#if REPORT_BASE > 1
		k.cfg()->write(std::cout);
		#endif

		changed |= _simplifyTerminator(k);
		
		#if REPORT_BASE > 1
		k.cfg()->write(std::cout);
		#endif
		
		changed |= _mergeBlockIntoPredecessor(k);
		
		#if REPORT_BASE > 1
		k.cfg()->write(std::cout);
		#endif
		
		changed |= _mergeExitBlocks(k);
		
		#if REPORT_BASE > 1
		k.cfg()->write(std::cout);
		#endif
	}
}
bool SimplifyControlFlowGraphPass::_deleteEmptyBlocks(ir::IRKernel& k)
{
	report(" Deleting empty blocks...");
	
	bool any = false;
	
	for(ir::ControlFlowGraph::iterator block = k.cfg()->begin();
		block != k.cfg()->end(); )
	{
		if(block == k.cfg()->get_entry_block()) { ++block; continue; }
		if(block == k.cfg()->get_exit_block())  { ++block; continue; }
	
		if(block->instructions.empty())
		{
			// redirect all in_edges to the target
			ir::BasicBlock::EdgePointerVector inEdges = block->in_edges;
		
			if(block->has_fallthrough_edge())
			{
				ir::ControlFlowGraph::iterator fallthrough =
					block->get_fallthrough_edge()->tail;

				k.cfg()->remove_edge(block->get_fallthrough_edge());

				for(ir::ControlFlowGraph::edge_pointer_iterator
					edge = inEdges.begin();	edge != inEdges.end(); ++edge)
				{
					if((*edge)->type == ir::Edge::FallThrough)
					{
						k.cfg()->insert_edge(ir::Edge((*edge)->head,
							fallthrough, ir::Edge::FallThrough));
					}
					else
					{
						k.cfg()->insert_edge(ir::Edge((*edge)->head,
							fallthrough, ir::Edge::Branch));
							
						ir::PTXInstruction& ptx =
							static_cast<ir::PTXInstruction&>(
							*(*edge)->head->instructions.back());
							
						ptx.d.identifier = fallthrough->label();
					}
				}
			}
		
			report("  " << block->label());
		
			// delete the block, should wipe out all edges
			k.cfg()->remove_block(block++);
		}
		else
		{
			++block;
		}
	}
	
	return any;
}
Exemple #5
0
/*! \brief Analyze the control and data flow graphs searching for divergent 
 *    variables and blocks
 *
 * 1) Makes data flow analysis that detects divergent variables and blocks 
 *    based on divergent sources, such as t.id, laneId
 * 2) Makes control flow analysis that detects new divergent variables based
 *    on the dependency of variables of variables created on divergent paths
 */
void DivergenceAnalysis::analyze(ir::IRKernel &k)
{
	Analysis* dfgAnalysis = getAnalysis("DataflowGraphAnalysis");
	assert(dfgAnalysis != 0);

	DataflowGraph &dfg = static_cast<DataflowGraph&>(*dfgAnalysis);

	dfg.convertToSSAType(DataflowGraph::Gated);

	assert(dfg.ssa());

	_divergGraph.clear();
	_notDivergentBlocks.clear();
	_kernel = &k;

	report("Running Divergence analysis on kernel '" << k.name << "'")
	#if REPORT_PTX > 0
	k.write(std::cout);
	#endif

	DivergenceGraph::node_set predicates;
		
	/* 1) Makes data flow analysis that detects divergent variables and blocks
		based on divergent sources, such as t.id, laneId */
	_analyzeDataFlow();
	/* 2) Makes control flow analysis that detects new divergent variables
		based on the dependency of variables of variables created on divergent
		paths */
	_analyzeControlFlow();
}
void AddLocationMetadataPass::runOnKernel(ir::IRKernel& k)
{
	report("Adding location meta data to kernel " << k.name);

	for(auto block = k.cfg()->begin(); block != k.cfg()->end(); ++block)
	{
		if(block->instructions.empty()) continue;
		
		ir::PTXInstruction& ptx = static_cast<ir::PTXInstruction&>(
			*block->instructions.front());
	
		ptx.metadata += " .location '" + k.getLocationString(ptx) + "'";
	
		report(" - " << ptx.toString() << " - " << ptx.metadata);
	}
}
void HoistParameterLoadsPass::runOnKernel(ir::IRKernel& k)
{
	typedef std::pair<ir::ControlFlowGraph::iterator, ir::PTXInstruction*> Load;
	typedef std::vector<Load> LoadVector;

	auto aliasAnalysis = static_cast<analysis::SimpleAliasAnalysis*>(
		getAnalysis(Analysis::SimpleAliasAnalysis));
	
	LoadVector candidateLoads;
	
	report("Hoisting loads in kernel '" << k.name << "'");

	report(" Identifying candidate loads");
	
	for(auto block = k.cfg()->begin(); block != k.cfg()->end(); ++block)
	{
		for(auto instruction = block->instructions.begin();
			instruction != block->instructions.end(); ++instruction)
		{
			auto ptx = static_cast<ir::PTXInstruction*>(*instruction);
		
			if(ptx->isLoad() &&
				aliasAnalysis->cannotAliasAnyStore(*instruction)
				&& hasNoRegisterDependencies(*ptx))
			{
				report("  " << ptx->toString());
				candidateLoads.push_back(std::make_pair(block, ptx));
			}
		}
	}
	
	report(" Attempting to hoist loads");
	for(auto load = candidateLoads.begin();
		load != candidateLoads.end(); ++load)
	{
		_tryHoistingLoad(load->first, load->second, k);
	}
	
	invalidateAnalysis(analysis::Analysis::DataflowGraphAnalysis);
	invalidateAnalysis(analysis::Analysis::SimpleAliasAnalysis  );
}
static void mergeBlocks(ir::IRKernel& k,
	ir::ControlFlowGraph::iterator predecessor,
	ir::ControlFlowGraph::iterator block)
{
	typedef std::vector<ir::Edge> EdgeVector;

	// delete the branch at the end of the predecessor
	auto branch = getBranch(predecessor);
	
	if(branch != 0)
	{
		delete branch;
		predecessor->instructions.pop_back();
	}
	
	// move remaining instructions intro the predecessor
	predecessor->instructions.insert(predecessor->instructions.end(),
		block->instructions.begin(), block->instructions.end());
	
	block->instructions.clear();

	// track the block's out edges
	EdgeVector outEdges;
	
	for(auto edge = block->out_edges.begin();
		edge != block->out_edges.end(); ++edge)
	{
		outEdges.push_back(**edge);
	}	
	
	// remove the block
	k.cfg()->remove_block(block);

	// add the edges back in
	for(auto edge = outEdges.begin(); edge != outEdges.end(); ++edge)
	{
		k.cfg()->insert_edge(ir::Edge(predecessor, edge->tail, edge->type));
	}
}
void SimpleAliasAnalysis::analyze(ir::IRKernel& kernel)
{
	// Functions can be called
	_aStoreCanReachThisFunction = !kernel.function();
	_kernel = &kernel;
	
	if(_aStoreCanReachThisFunction) return;
	
	for(auto block = kernel.cfg()->begin();
		block != kernel.cfg()->end(); ++block)
	{
		for(auto instruction = block->instructions.begin();
			instruction != block->instructions.end(); ++instruction)
		{
			auto ptx = static_cast<ir::PTXInstruction*>(*instruction);
			
			if(ptx->isStore())
			{
				_aStoreCanReachThisFunction = true;
				return;
			}
		}
	}
}
bool SimplifyControlFlowGraphPass::_mergeBlockIntoPredecessor(ir::IRKernel& k)
{
	bool merged = false;
	
	report(" Merging blocks with predecessors...");
	
	for(ir::ControlFlowGraph::iterator block = k.cfg()->begin();
		block != k.cfg()->end(); )
	{
		if(block == k.cfg()->get_entry_block()) { ++block; continue; }
		if(block == k.cfg()->get_exit_block())  { ++block; continue; }
	
		// Block has a single predecessor
		bool singlePredecessor = block->in_edges.size() == 1;
	
		if(!singlePredecessor) { ++block; continue; }
	
		// Predecessor has single successor
		auto predecessor = block->in_edges.back()->head;
		
		if(predecessor == k.cfg()->get_entry_block()) { ++block; continue; }
		
		bool singleSuccessor = predecessor->out_edges.size() == 1;
		
		if(!singleSuccessor) { ++block; continue; }
		
		report("  " << predecessor->label() << " <- " << block->label());
		
		// Merge the blocks
		mergeBlocks(k, predecessor, block++);
		
		merged = true;
	}
	
	return merged;
}
void DivergenceLinearScan::runOnKernel(ir::IRKernel& k)
{
	auto dfg = static_cast<analysis::DataflowGraph*>(
		getAnalysis("DataflowGraphAnalysis"));
	
	dfg->convertToSSAType(analysis::DataflowGraph::Gated);

#if DIVERGENCE_REGISTER_PROFILE_H_
	divergenceProfiler::resetSpillData();
#endif

	_shared.clear();
	LinearScanRegisterAllocationPass::runOnKernel(k);
#if DIVERGENCE_REGISTER_PROFILE_H_
	if(!k.function())
		divergenceProfiler::printSpillResults(k.name);
#endif
}
void AffineLinearScan::runOnKernel(ir::IRKernel& k)
{
	#if AFFINE_REGISTER_PROFILE_H_
	affineProfiler::resetSpillData();
	#endif
	
	AffineRegister::tempRegisters.clear();
	AffineRegister::warpPosition = 0;
	_shared.clear();
	
	LinearScanRegisterAllocationPass::runOnKernel(k);
	
	#if AFFINE_REGISTER_PROFILE_H_
	if(!k.function())
	{
		affineProfiler::printSpillResults(k.name);
	}
	#endif
}
void PostdominatorTree::analyze(ir::IRKernel& kernel) {
    // form a vector of the basic blocks in post-order
    report("Building post-dominator tree.");
    cfg = kernel.cfg();
    report(" Starting with post order sequence");
    // form a vector of the basic blocks in post-order
    ir::ControlFlowGraph::BlockPointerVector
    post_order = cfg->reverse_topological_sequence();

    ir::ControlFlowGraph::reverse_pointer_iterator it = post_order.rbegin();
    ir::ControlFlowGraph::reverse_pointer_iterator end = post_order.rend();
    for (; it != end; ++it) {
        blocks.push_back(*it);
        blocksToIndex[*it] = (int)blocks.size()-1;
        p_dom.push_back(-1);
        report("  " << (*it)->label());
    }

    computeDT();
}
static InstructionSet getInstructionsThatCanObserveSideEffects(ir::IRKernel& k)
{
	InstructionSet instructions;
	
	report(" Getting instructions that can observe side-effects");
	
	for(auto& block : *k.cfg())
	{
		for(auto instruction : block.instructions)
		{
			auto ptxInstruction = static_cast<ir::PTXInstruction*>(instruction);
		
			if(ptxInstruction->canObserveSideEffects())
			{
				report("  " << ptxInstruction->toString());
				
				instructions.insert(ptxInstruction);
			}
		}
	}
	
	return instructions;
}
void ThreadFrontierReconvergencePass::runOnKernel(const ir::IRKernel& k)
{
	report("Running thread frontier reconvergence pass");
	typedef analysis::ThreadFrontierAnalysis::Priority Priority;
	typedef std::multimap<Priority, ir::ControlFlowGraph::const_iterator,
		std::greater<Priority>> ReversePriorityMap;
	typedef analysis::ThreadFrontierAnalysis TFAnalysis;
	typedef ir::ControlFlowGraph::const_pointer_iterator const_pointer_iterator;

	Analysis* analysis = getAnalysis(Analysis::ThreadFrontierAnalysis);
	assert(analysis != 0);
	
	TFAnalysis* tfAnalysis = static_cast<TFAnalysis*>(analysis);

	ReversePriorityMap priorityToBlocks;
	
	// sort by priority (high to low)
	for(ir::ControlFlowGraph::const_iterator block = k.cfg()->begin();
		block != k.cfg()->end(); ++block)
	{
		priorityToBlocks.insert(std::make_pair(tfAnalysis->getPriority(block),
			block));
	}
	
	typedef std::unordered_map<ir::BasicBlock::Id, unsigned int> IdToPCMap;
	typedef std::unordered_map<unsigned int,
		ir::ControlFlowGraph::const_iterator> PCToBlockMap;

	IdToPCMap     pcs;
	PCToBlockMap  branchPCs;
	PCToBlockMap  fallthroughPCs;
	
	// lay the code out in priority order
	report(" Packing instructions into a vector");
	for(ReversePriorityMap::const_iterator
		priorityAndBlock = priorityToBlocks.begin();
		priorityAndBlock != priorityToBlocks.end(); ++priorityAndBlock)
	{
		ir::ControlFlowGraph::const_iterator block = priorityAndBlock->second;

		report("  Basic Block " << block->label() << " ("
			<< block->id << ")");
			
		pcs.insert(std::make_pair(block->id, instructions.size()));
		
		for(ir::ControlFlowGraph::InstructionList::const_iterator 
			instruction = block->instructions.begin();
			instruction != block->instructions.end(); ++instruction)
		{
			const ir::PTXInstruction& ptx = static_cast<
				const ir::PTXInstruction&>(**instruction);
				
			report("   [" << instructions.size() << "] '" << ptx.toString());
			
			instructions.push_back(ptx);
			instructions.back().pc = instructions.size() - 1;
			
			if(ptx.opcode == ir::PTXInstruction::Bra)
			{
				branchPCs.insert(std::make_pair(instructions.back().pc, block));
			}
		}
		
		if(!_gen6)
		{
			// Add a branch for the fallthrough if it is in the TF
			if(block->has_fallthrough_edge())
			{
				ir::ControlFlowGraph::const_iterator target =
					block->get_fallthrough_edge()->tail;
				
				ReversePriorityMap::const_iterator next = priorityAndBlock;
				++next;
				
				bool needsCheck = target != next->second;
				
				TFAnalysis::BlockVector frontier =
						tfAnalysis->getThreadFrontier(block);
		
				for(TFAnalysis::BlockVector::const_iterator
					stalledBlock = frontier.begin();
					stalledBlock != frontier.end(); ++stalledBlock)
				{
					if((*stalledBlock)->id == target->id)
					{
						needsCheck = true;
						break;
					}
				}
				
				if(needsCheck)
				{
					fallthroughPCs.insert(std::make_pair(
						instructions.size(), block));
					
					instructions.push_back(ir::PTXInstruction(
						ir::PTXInstruction::Bra,
						ir::PTXOperand(target->label())));
		
					instructions.back().needsReconvergenceCheck = true;
					instructions.back().branchTargetInstruction = -1;
					report("   [" << (instructions.size() - 1) << "] '"
						<< instructions.back().toString());
					report("    - artificial branch for check on"
						" fallthrough into TF.");
				}
			}
		}
	}
	
	report(" Updating branch targets");
	for(PCToBlockMap::const_iterator pcAndBlock = branchPCs.begin();
		pcAndBlock != branchPCs.end(); ++pcAndBlock)
	{
		ir::ControlFlowGraph::const_iterator block = pcAndBlock->second;
		unsigned int pc                            = pcAndBlock->first;
		
		const ir::PTXInstruction& ptx = static_cast<
			const ir::PTXInstruction&>(instructions[pc]);
			
		ir::ControlFlowGraph::const_iterator target =
			block->get_branch_edge()->tail;
					
		IdToPCMap::const_iterator targetPC = pcs.find(target->id);
		assert(targetPC != pcs.end());
		
		report("  setting branch target of '" << ptx.toString()
			<< "' to " << targetPC->second);
		
		instructions[pc].branchTargetInstruction = targetPC->second;
		
		TFAnalysis::BlockVector frontier =
			tfAnalysis->getThreadFrontier(block);
		
		if(_gen6)
		{
			ir::ControlFlowGraph::const_iterator firstBlock =
				k.cfg()->end();
		
			TFAnalysis::Priority highest = 0;
		
			frontier.push_back(block->get_branch_edge()->tail);
			
			if(block->has_fallthrough_edge())
			{
				frontier.push_back(
					block->get_fallthrough_edge()->tail);
			}
		
			// gen6 jumps to the block with the highest priority
			for(TFAnalysis::BlockVector::const_iterator
				stalledBlock = frontier.begin();
				stalledBlock != frontier.end(); ++stalledBlock)
			{
				TFAnalysis::Priority priority =
					tfAnalysis->getPriority(*stalledBlock);
				if(priority >= highest)
				{
					highest    = priority;
					firstBlock = *stalledBlock;
				}
			}
		
			// the reconverge point is the first block in the frontier
			assert(firstBlock != k.cfg()->end());
	
			IdToPCMap::const_iterator reconverge =
				pcs.find(firstBlock->id);
			assert(reconverge != pcs.end());
			instructions[pc].reconvergeInstruction =
				reconverge->second;
			report("   re-converge point "  << reconverge->second
				<< ", " << firstBlock->label());
		}
		else
		{
			// Does this branch need to check for re-convergence?
			// Or: are any of the target's predecessors
			//       in the thread frontier?
			bool needsCheck = false;
			
			for(TFAnalysis::BlockVector::const_iterator
				stalledBlock = frontier.begin();
				stalledBlock != frontier.end(); ++stalledBlock)
			{
				if((*stalledBlock)->id == target->id)
				{
					needsCheck = true;
					report("   needs re-convergence check.");
					break;
				}
			}
			
			instructions[pc].needsReconvergenceCheck = needsCheck;
		}
	}
	
	report(" Updating fallthrough targets");
	for(PCToBlockMap::const_iterator pcAndBlock = fallthroughPCs.begin();
		pcAndBlock != fallthroughPCs.end(); ++pcAndBlock)
	{
		ir::ControlFlowGraph::const_iterator block = pcAndBlock->second;
		unsigned int pc                            = pcAndBlock->first;
		
		const ir::PTXInstruction& ptx = static_cast<
			const ir::PTXInstruction&>(instructions[pc]);
			
		ir::ControlFlowGraph::const_iterator target =
			block->get_fallthrough_edge()->tail;
					
		IdToPCMap::const_iterator targetPC = pcs.find(target->id);
		assert(targetPC != pcs.end());
		
		report("  setting branch target of '" << ptx.toString()
			<< "' to " << targetPC->second);
		
		instructions[pc].branchTargetInstruction = targetPC->second;
	}
	
}
void FunctionInliningPass::_getFunctionsToInline(ir::IRKernel& k)
{
	report(" Finding functions that are eligible for inlining...");

	for(auto block = k.cfg()->begin(); block != k.cfg()->end(); ++block)
	{
		bool linked = false;
		
		for(auto instruction = block->instructions.begin();
			instruction != block->instructions.end(); ++instruction)
		{
			auto ptx = static_cast<ir::PTXInstruction&>(**instruction);
			
			if(ptx.opcode != ir::PTXInstruction::Call) continue;
			
			report("  Examining " << ptx.toString());

			if(ptx.a.addressMode != ir::PTXOperand::FunctionName)
			{
				report("   skipping because it is an indirect call.");
				continue;
			}
			
			// Get the kernel being called if it is in this module
			auto calledKernel = k.module->getKernel(ptx.a.identifier);
			
			// Skip kernels in another module
			if(calledKernel == 0)
			{
				report("   skipping because it is in a different module.");
				continue;
			}
			
			// Skip kernels that are built-in functions
			if(isBuiltin(ptx.a.identifier))
			{
				report("   skipping because it is a reserved keyword.");
				continue;
			}
			
			// Skip kernels that are too large to inline
			if(calledKernel->cfg()->instructionCount() > thresholdToInline)
			{
				report("   skipping because it is too large ("
					<< calledKernel->cfg()->instructionCount()
					<< " > " << thresholdToInline << ").");
				continue;
			}
			
			report("   it is eligible for inlining!");
			
			if(linked)
			{
				_calls.back().linked = true;
			}
			
			_calls.push_back(FunctionCallDescriptor(
				instruction, block, calledKernel));
		
			linked = true;
		}
	}
}
static ir::ControlFlowGraph::iterator convertCallToJumps(
	const BasicBlockMap& newBlocks,
	ir::ControlFlowGraph::iterator functionEntry,
	ir::ControlFlowGraph::iterator functionExit, ir::IRKernel& kernel,
	ir::ControlFlowGraph::instruction_iterator call,
	ir::ControlFlowGraph::iterator block)
{
	// split the block
	auto firstInstructionOfSplitBlock = call;
	++firstInstructionOfSplitBlock;
	
	auto returnBlock = kernel.cfg()->split_block(block,	
		firstInstructionOfSplitBlock, ir::Edge::Invalid);
	kernel.cfg()->remove_edge(block->out_edges.front());	
	
	// add edges
	kernel.cfg()->insert_edge(ir::Edge(block, functionEntry, ir::Edge::Branch));
	kernel.cfg()->insert_edge(ir::Edge(functionExit,
		returnBlock, ir::Edge::Branch));
	
	ir::PTXInstruction& ptxCall = static_cast<ir::PTXInstruction&>(**call);
	
	if(ptxCall.pg.condition != ir::PTXOperand::PT)
	{
		kernel.cfg()->insert_edge(ir::Edge(block,
			returnBlock, ir::Edge::FallThrough));
	}
	else
	{
		ptxCall.uni = true;
	}

	// set branch to function instruction
	
	ptxCall = ir::PTXInstruction(ir::PTXInstruction::Bra);
	
	ptxCall.d.addressMode = ir::PTXOperand::Label;
	ptxCall.d.identifier  = functionEntry->label();
	
	// set all return instructions to branches to the exit node
	for(auto block = newBlocks.begin(); block != newBlocks.end(); ++block)
	{
		for(auto instruction = block->second->instructions.begin();
			instruction != block->second->instructions.end(); ++instruction)
		{
			ir::PTXInstruction& ptx = static_cast<ir::PTXInstruction&>(
				**instruction);
		
			if(ptx.opcode != ir::PTXInstruction::Ret) continue;
			
			ptx = ir::PTXInstruction(ir::PTXInstruction::Bra);

			ptx.d.addressMode = ir::PTXOperand::Label;
			ptx.d.identifier  = functionExit->label();
			
			if(block->second->has_fallthrough_edge() &&
				ptx.pg.condition == ir::PTXOperand::PT)
			{
				auto fallthrough = block->second->get_fallthrough_edge();
				
				ptx.uni = true;
				
				ir::Edge newEdge(fallthrough->head, fallthrough->tail,
					ir::Edge::Branch);
				
				kernel.cfg()->remove_edge(fallthrough);
				kernel.cfg()->insert_edge(newEdge);				
			}
			
			break;
		}
	}
	
	// set branch back after executing function
	auto ret = new ir::PTXInstruction(ir::PTXInstruction::Bra);
	
	ret->uni = true;
	
	ret->d.addressMode = ir::PTXOperand::Label;
	ret->d.identifier  = returnBlock->label();
	
	functionExit->instructions.push_back(ret);
	
	return returnBlock;
}
static void insertAndConnectBlocks(BasicBlockMap& newBlocks,
	ir::ControlFlowGraph::iterator& functionEntry,
	ir::ControlFlowGraph::iterator& functionExit,
	ir::IRKernel& kernel, unsigned int& nextRegister,
	const ir::IRKernel& inlinedKernel)
{
	typedef std::unordered_map<ir::PTXOperand::RegisterType,
		ir::PTXOperand::RegisterType> RegisterMap;
	
	ir::IRKernel copy;
	const ir::IRKernel* inlinedKernelPointer = &inlinedKernel;
	
	// create a copy if the call is recursive
	if(inlinedKernelPointer == &kernel)
	{
		copy = inlinedKernel;
		inlinedKernelPointer = &copy;
	}
	
	//  Insert new blocks
	for(auto block = inlinedKernelPointer->cfg()->begin();
		block != inlinedKernelPointer->cfg()->end(); ++block)
	{
		auto newBlock = kernel.cfg()->clone_block(block);
		
		newBlocks.insert(std::make_pair(block, newBlock));
	}
	
	//  Connect new blocks, rename branch labels
	for(auto block = newBlocks.begin(); block != newBlocks.end(); ++block)
	{
		for(auto edge = block->first->out_edges.begin();
			edge != block->first->out_edges.end(); ++edge)
		{
			auto headBlock = block->second;
			auto tail      = (*edge)->tail;
			
			auto tailBlock = newBlocks.find(tail);
			assert(tailBlock != newBlocks.end());
			
			kernel.cfg()->insert_edge(ir::Edge(headBlock,
				tailBlock->second, (*edge)->type));
			
			if((*edge)->type == ir::Edge::Branch)
			{
				assert(!headBlock->instructions.empty());
				auto instruction = headBlock->instructions.back();
				
				auto branch = static_cast<ir::PTXInstruction*>(instruction);

				if(branch->opcode == ir::PTXInstruction::Ret) continue;

				assertM(branch->opcode == ir::PTXInstruction::Bra, "Expecting "
					<< branch->toString() << " to be a branch");
				
				branch->d.identifier = tailBlock->second->label();
			}
		}
	}
	
	//  Assign copied blocks new registers
	RegisterMap newRegisters;
	
	for(auto block = newBlocks.begin(); block != newBlocks.end(); ++block)
	{
		for(auto instruction = block->second->instructions.begin();
			instruction != block->second->instructions.end(); ++instruction)
		{
			ir::PTXInstruction& ptx = static_cast<ir::PTXInstruction&>(
				**instruction);
		
			ir::PTXOperand* operands[] = {&ptx.pg, &ptx.pq, &ptx.d, &ptx.a,
				&ptx.b, &ptx.c};
				
			for(unsigned int i = 0; i < 6; ++i)
			{
				ir::PTXOperand& operand = *operands[i];
				
				if( operand.addressMode != ir::PTXOperand::Register &&
					operand.addressMode != ir::PTXOperand::Indirect &&
					operand.addressMode != ir::PTXOperand::ArgumentList)
				{
					continue;
				}
				
				if(operand.type != ir::PTXOperand::pred)
				{
					if(operand.array.empty() &&
						operand.addressMode != ir::PTXOperand::ArgumentList)
					{
						auto mapping = newRegisters.find(operand.reg);
						
						if(mapping == newRegisters.end())
						{
							mapping = newRegisters.insert(std::make_pair(
								operand.reg, nextRegister++)).first;
						}
						
						operand.reg = mapping->second;
					}
					else
					{
						for(auto subOperand = operand.array.begin(); 
							subOperand != operand.array.end(); ++subOperand )
						{
							if(!subOperand->isRegister()) continue;
							
							auto mapping = newRegisters.find(subOperand->reg);
						
							if(mapping == newRegisters.end())
							{
								mapping = newRegisters.insert(std::make_pair(
									subOperand->reg, nextRegister++)).first;
							}
						
							subOperand->reg = mapping->second;
						}
					}
				}
				else if(operand.addressMode != ir::PTXOperand::ArgumentList)
				{
					if(operand.condition == ir::PTXOperand::Pred
						|| operand.condition == ir::PTXOperand::InvPred)
					{
						auto mapping = newRegisters.find(operand.reg);
						
						if(mapping == newRegisters.end())
						{
							mapping = newRegisters.insert(std::make_pair(
								operand.reg, nextRegister++)).first;
						}
						
						operand.reg = mapping->second;
					}
				}
			}
		}
	}
	
	//  Assign copied blocks new local variables
	typedef std::unordered_map<std::string, std::string> LocalMap;
	
	LocalMap locals;
	
	for(auto local = inlinedKernel.locals.begin();
		local != inlinedKernel.locals.end(); ++local)
	{
		std::string newName = "_Zinlined_" + local->first;
	
		locals.insert(std::make_pair(local->first, newName));
		
		auto newLocal = kernel.locals.insert(
			std::make_pair(newName, local->second)).first;
		
		newLocal->second.name = newName;
	}
	
	for(auto block = newBlocks.begin(); block != newBlocks.end(); ++block)
	{
		for(auto instruction = block->second->instructions.begin();
			instruction != block->second->instructions.end(); ++instruction)
		{
			ir::PTXInstruction& ptx = static_cast<ir::PTXInstruction&>(
				**instruction);
		
			if(!ptx.mayHaveAddressableOperand()) continue;
		
			ir::PTXOperand* operands[] = {&ptx.pg, &ptx.pq, &ptx.d, &ptx.a,
				&ptx.b, &ptx.c};
				
			for(unsigned int i = 0; i < 6; ++i)
			{
				ir::PTXOperand& operand = *operands[i];
				
				if(operand.addressMode != ir::PTXOperand::Address) continue;
				
				auto local = locals.find(operand.identifier);
				
				if(local == locals.end()) continue;
				
				operand.identifier = local->second;
			}
		}
	}
	
	//  Get the entry and exit points
	auto entryMapping = newBlocks.find(
		inlinedKernelPointer->cfg()->get_entry_block());
	assert(entryMapping != newBlocks.end());
	
	functionEntry = entryMapping->second;
	
	auto exitMapping = newBlocks.find(
		inlinedKernelPointer->cfg()->get_exit_block());
	assert(exitMapping != newBlocks.end());
	
	functionExit = exitMapping->second;
}
bool SimplifyControlFlowGraphPass::_mergeExitBlocks(ir::IRKernel& k)
{
	typedef std::unordered_map<ir::ControlFlowGraph::iterator,
		ir::ControlFlowGraph::instruction_iterator> BlockMap;
	
	report(" Merging exit blocks...");
	
	BlockMap exitBlocks;
	
	// Find all blocks with exit instructions
	for(ir::ControlFlowGraph::iterator block = k.cfg()->begin();
		block != k.cfg()->end(); ++block)
	{
		for(ir::ControlFlowGraph::instruction_iterator
			instruction = block->instructions.begin();
			instruction != block->instructions.end(); ++instruction)
		{
			ir::PTXInstruction& ptx =
				static_cast<ir::PTXInstruction&>(**instruction);
			
			if(ptx.isExit() && ptx.opcode != ir::PTXInstruction::Trap)
			{
				// There should be an edge to the exit block
				assertM(block->find_out_edge(k.cfg()->get_exit_block()) !=
					block->out_edges.end(), "No edge from " << block->label()
					<< " to exit node.");
			
				exitBlocks.insert(std::make_pair(block, instruction));
				break;
			}
		}
	}
	
	// If there is only one/zero blocks, then don't change anything
	if(exitBlocks.size() < 2)
	{
		if(exitBlocks.size() == 1)
		{
			ir::PTXInstruction& ptx =
				static_cast<ir::PTXInstruction&>(**exitBlocks.begin()->second);
			
			if(k.function())
			{
				ptx.opcode = ir::PTXInstruction::Ret;
			}
			else
			{
				ptx.opcode = ir::PTXInstruction::Exit;
			}
		}
		
		return false;
	}
	// Otherwise...
	
	// 1) create a new exit block
	ir::ControlFlowGraph::iterator newExit = k.cfg()->insert_block(
		ir::BasicBlock(k.cfg()->newId()));
	
	ir::BasicBlock::EdgePointerVector deletedEdges =
		k.cfg()->get_exit_block()->in_edges;
	
	// 1a) Create edges targetting the new block
	for(ir::ControlFlowGraph::edge_pointer_iterator edge = deletedEdges.begin();
		edge != deletedEdges.end(); ++edge)
	{
		k.cfg()->insert_edge(ir::Edge((*edge)->head, newExit, (*edge)->type));
		k.cfg()->remove_edge(*edge);
	}
	
	k.cfg()->insert_edge(ir::Edge(newExit, k.cfg()->get_exit_block(),
		ir::Edge::FallThrough));
	
	// 2) Delete the instructions from their blocks
	for(BlockMap::iterator block = exitBlocks.begin();
		block != exitBlocks.end(); ++block)
	{
		report("  merging block " << block->first->label());
		
		// 2a) Insert a branch from blocks with branch edges
		ir::ControlFlowGraph::edge_pointer_iterator edge =
			newExit->find_in_edge(block->first);
		
		if((*edge)->type == ir::Edge::Branch)
		{
			ir::PTXInstruction* newBranch = new ir::PTXInstruction(
				ir::PTXInstruction::Bra, ir::PTXOperand(newExit->label()));
				
			newBranch->uni = true;
		
			block->first->instructions.push_back(newBranch);
		}
		
		delete *block->second;
		block->first->instructions.erase(block->second);
	}
	
	// 3 Add an appropriate exit instruction to the new exit block
	if(k.function())
	{
		newExit->instructions.push_back(
			new ir::PTXInstruction(ir::PTXInstruction::Ret));
	}
	else
	{
		newExit->instructions.push_back(
			new ir::PTXInstruction(ir::PTXInstruction::Exit));
	}
	
	return true;
}