	const DataflowGraph::iterator &block) {
	const Analysis* dfgAnalysis = getAnalysis("DataflowGraphAnalysis");
	assert(dfgAnalysis != 0);

	const DataflowGraph &cdfg =
		static_cast<const DataflowGraph&>(*dfgAnalysis);
	DataflowGraph &dfg = const_cast<DataflowGraph&>(cdfg);
	PostdominatorTree* dtree = (PostdominatorTree*)
	auto postDominator = dfg.getCFGtoDFGMap()[

	block_set divergentBlocks;

	for (auto successor = block->successors().begin();
		successor != block->successors().end(); ++successor) {
		if (*successor == postDominator) continue;
		block_set allDivergentPaths;
		buildDivergentSubgraph(allDivergentPaths, *successor, postDominator);
	return divergentBlocks;
/* Counts the divergent paths that lead from `block` to its post-dominator.

	The dataflow graph is obtained through
	getAnalysis("DataflowGraphAnalysis") and every successor of `block` is
	visited. The running total starts at zero, is logged through report(),
	and is returned to the caller.

	\param block Iterator to the dataflow-graph block whose successors are
		examined.
	\return The value of `divergentPaths` after the successor loop.

	NOTE(review): this listing appears to be truncated -- the initializers
	of `dtree` and `postDominator` are cut off mid-expression, and the
	bodies of the `if` statements (where `divergentPaths` is presumably
	incremented) and all closing braces are not visible. Confirm against
	the complete file before relying on this text. */
unsigned int DivergenceAnalysis::_numberOfDivergentPathsToPostDominator(
	const DataflowGraph::iterator &block) const {
	// The dataflow graph analysis must have been made available beforehand.
	const Analysis* dfgAnalysis = getAnalysis("DataflowGraphAnalysis");
	assert(dfgAnalysis != 0);

	// Constness is cast away so that a mutable `dfg` reference can be used
	// below (presumably because getCFGtoDFGMap() is non-const -- confirm).
	const DataflowGraph &cdfg =
		static_cast<const DataflowGraph&>(*dfgAnalysis);
	DataflowGraph &dfg = const_cast<DataflowGraph&>(cdfg);
	// NOTE(review): the right-hand sides of the next two declarations are
	// incomplete in this listing.
	PostdominatorTree* dtree = (PostdominatorTree*)
	auto postDominator = dfg.getCFGtoDFGMap()[

	unsigned int divergentPaths = 0;

	// Visit each successor of `block`.
	for (auto successor = block->successors().begin();
		successor != block->successors().end(); ++successor) {
		// Successor is the post-dominator itself (handling not visible here).
		if (*successor == postDominator) {
		// Fresh block set for this successor, handed to the reachability query.
		block_set allDivergentPaths;
		if (doAnyDivergentPathsReachThePostDominator(allDivergentPaths,
			*successor, postDominator)) {
	report("  There are " << divergentPaths << " divergent paths from "
		<< block->label() << " to post-dominator " << postDominator->label());

	return divergentPaths;
/* Looks for the pair of conditional-branch successors in kernel `k` with
	the largest unification gain and weaves them together.

	Every block of k.dfg() that ends with a conditional branch (plus a
	second condition that is cut off in this listing) is examined: its
	fallthrough block and a branch target different from the fallthrough
	are located, and, when thereIsPathFromB1toB2() reports no path in
	either direction between them,
	BlockMatcher::calculateUnificationGain() is evaluated for the pair.
	The best-scoring candidate is remembered and, if its gain is above
	10.0, handed to weaveBlocks().

	\param k Kernel whose dataflow graph is inspected and possibly modified.

	NOTE(review): this listing appears to be truncated -- several
	statements end mid-expression and the closing braces are missing.
	Confirm against the complete file. */
void BlockUnificationPass::runOnKernel( ir::Kernel& k )
	InstructionConverter instConv;
	// Target capability is fixed to Cap_2_0 for the gain computation.
	ir::PTXInstruction::ComputeCapability deviceCapability = ir::PTXInstruction::Cap_2_0;
	// Best candidate found so far: the branching block, its two successors
	// and the matching path computed for them.
	DataflowGraph::iterator unificationBranch;
	DataflowGraph::iterator unificationTarget1;
	DataflowGraph::iterator unificationTarget2;
	BlockMatcher::MatrixPath bestPath;
	float largestGain = 0.0;

	// analyze kernel for divergence
	// NOTE(review): only the declaration is visible here; the call that runs
	// the analysis is not part of this listing.
	DivergenceAnalysis divAnalysis;

	// The loop condition is `false` (the gain-based condition is commented
	// out at the bottom), so this body executes exactly once.
	do {
		largestGain = 0.0;

		DataflowGraph::iterator block = k.dfg()->begin();
		for (; block != k.dfg()->end(); ++block) {
			ir::ControlFlowGraph::const_iterator irBlock = block->block();
			DataflowGraph::const_iterator constBlock = block;

			// NOTE(review): the second operand of this `&&` is missing in
			// this listing.
			if (irBlock->endsWithConditionalBranch() &&
			) {
				// get the fallthrough block
				DataflowGraph::iterator fallthroughBlock = block->fallthrough();

				// get the branch block: a target of `block` that is not the
				// fallthrough block
				DataflowGraph::iterator branchBlock = fallthroughBlock;
				DataflowGraph::BlockPointerSet branchTargets = block->targets();
				DataflowGraph::BlockPointerSet::const_iterator it =
				for (; it != branchTargets.end(); ++it) {
					if (*it != fallthroughBlock) {
						branchBlock = *it;
				// branchBlock still equal to fallthroughBlock means that no
				// distinct branch target was found.
				assertM(branchBlock != fallthroughBlock,
						"Block unification pass error: could not find fallthrough");

				ir::PostdominatorTree* pdomTree = k.pdom_tree();
				ir::ControlFlowGraph::const_iterator postDomBlk =
				// NOTE(review): both calls below pass a freshly `new`-ed
				// std::set and no matching delete is visible in this block;
				// unless the callee takes ownership these allocations leak.
				bool haveBranch2FallthroughPath = thereIsPathFromB1toB2(
						branchBlock->block(), fallthroughBlock->block(),
						postDomBlk, new std::set<ir::ControlFlowGraph::BasicBlock*>);
				bool haveFallthrough2BranchPath = thereIsPathFromB1toB2(
						fallthroughBlock->block(), branchBlock->block(),
						postDomBlk, new std::set<ir::ControlFlowGraph::BasicBlock*>);
				// Only consider pairs for which neither query found a path.
				if (!haveBranch2FallthroughPath && !haveFallthrough2BranchPath) {
					// Calculate branch targets' unification gain
					BlockMatcher::MatrixPath path;
					float gain = BlockMatcher::calculateUnificationGain(
							k.dfg(), *fallthroughBlock, *branchBlock, path,
							instConv, deviceCapability);

					// Remember this pair if it beats the best gain so far.
					if (gain > largestGain) {
						largestGain = gain;
						unificationBranch = block;
						unificationTarget1 = fallthroughBlock;
						unificationTarget2 = branchBlock;
						bestPath = path;

		// Unification is only performed when the best gain exceeds 10.0.
		if (largestGain > 10.0) {
			// Unify the basic block pair with biggest gain (if there's one)
			cout << ">>>>> unifying blocks: " << unificationTarget1->block()->label
								<< " and " << unificationTarget2->block()->label << std::endl;
			weaveBlocks(unificationBranch, unificationTarget1, unificationTarget2, bestPath, k.dfg());

		// refresh divergence analysis data
	} while (false); //(largestGain > 0.0);
/* Weaves `target1` and `target2` into a chain of new blocks inserted into
	`dfg`, driven by the steps of `extractionPath`.

	A BlockExtractor walks the path. For a Match or Substitution step a
	single block labelled "<prefix>_uni_<n>" is inserted; for any other
	step a pair of blocks labelled "<prefix>_ft_<n>" and "<prefix>_bra_<n>"
	is inserted and filled through extractor.extractDivergentBlocks().
	After the walk, the trailing instruction of `branchBlock` is erased if
	the block has no targets left, edge types are swapped when `target1`
	has no fallthrough edge, the outgoing branch edges of `target1` and
	`target2` are copied onto the last woven blocks, and
	replaceRegisters() is invoked.

	\param branchBlock Block whose terminator supplies the branch predicate.
	\param target1 First block to weave (runOnKernel passes the
		fallthrough successor here).
	\param target2 Second block to weave (runOnKernel passes the branch
		target here).
	\param extractionPath Matching path consumed by the BlockExtractor.
	\param dfg Dataflow graph that receives the new blocks and edges.

	NOTE(review): this listing appears to be truncated -- closing braces
	and some statements are missing. Confirm against the complete file. */
void BlockUnificationPass::weaveBlocks(DataflowGraph::iterator branchBlock, DataflowGraph::iterator target1, DataflowGraph::iterator target2, BlockMatcher::MatrixPath& extractionPath, DataflowGraph* dfg)
	// Both cursors start at branchBlock and advance to the most recently
	// created block(s) after every step.
	DataflowGraph::iterator oldFallthroughBlock = branchBlock;
	DataflowGraph::iterator oldBranchBlock = branchBlock;

	// get branch predicate
	// NOTE(review): the terminator is downcast with static_cast without a
	// type check; assumes it is always an ir::PTXInstruction -- confirm.
	ir::ControlFlowGraph::const_iterator irBlock = branchBlock->block();
	ir::Instruction* branchInst = irBlock->getTerminator();
	ir::PTXInstruction* branchInstPtx = static_cast<ir::PTXInstruction*>(branchInst);
	ir::PTXOperand* branchPredicate = &(branchInstPtx->pg);

	// Labels of all generated blocks share this prefix; blockNum makes
	// each one distinct.
	std::string labelPrefix = "$BBweave_" + target1->block()->label + "_" + target2->block()->label;
	int blockNum = 0;

	// while not consumed path, generate basic blocks
	BlockExtractor extractor(dfg, target1, target2, extractionPath, *branchPredicate);
	while (extractor.hasNext()) {
		// NOTE(review): nextStep() is called twice in this condition;
		// presumably it peeks without consuming the step -- confirm.
		if (extractor.nextStep() == BlockMatcher::Match ||
				extractor.nextStep() == BlockMatcher::Substitution) {
			// block label
			std::stringstream blockLabel;
			blockLabel << labelPrefix << "_uni_" << blockNum++;

			// create block
			DataflowGraph::iterator newUnifiedBlock = dfg->insert(oldFallthroughBlock, target1, blockLabel.str());

			// link blocks
			dfg->addEdge(newUnifiedBlock, target2, ir::ControlFlowGraph::Edge::Branch);
			if (oldFallthroughBlock == oldBranchBlock) {
				// oldFallthroughBlock and oldBranchBlock both point to the
				// same block.

				// remove oldBranchBlock -> target2 because there is
				// already an edge from branchBlock to newUnifiedBlock
				dfg->removeEdge(oldBranchBlock, target2);
			} else {
				// The cursors differ: replace the edge from the fallthrough
				// cursor by a Branch edge and redirect the branch cursor
				// from target2 to the new unified block.
				dfg->removeEdge(oldFallthroughBlock, newUnifiedBlock);
				dfg->redirect(oldBranchBlock, target2, newUnifiedBlock);
				dfg->addEdge(oldFallthroughBlock, newUnifiedBlock, ir::ControlFlowGraph::Edge::Branch);

				// add goto in oldFallthroughBlock to newUnifiedBlock
				// NOTE(review): gotoLabelOperand is constructed but no
				// visible line attaches it to gotoPtx -- confirm against
				// the complete file.
				ir::PTXInstruction gotoPtx(ir::PTXInstruction::Bra);
				ir::PTXOperand gotoLabelOperand(blockLabel.str(), ir::PTXOperand::Label, ir::PTXOperand::s32);
				gotoPtx.uni = true;
				ir::Instruction& gotoInst = gotoPtx;
				dfg->insert(oldFallthroughBlock, gotoInst);

			// Both cursors now refer to the unified block.
			oldFallthroughBlock = newUnifiedBlock;
			oldBranchBlock = newUnifiedBlock;
		} else {

			// create fallthrough block
			std::stringstream fallthroughLabel;
			fallthroughLabel << labelPrefix << "_ft_" << blockNum++;
			DataflowGraph::iterator newFallthoughBlock = dfg->insert(oldFallthroughBlock, target1, fallthroughLabel.str());

			// create branch block
			std::stringstream branchLabel;
			branchLabel << labelPrefix << "_bra_" << blockNum++;
			DataflowGraph::iterator newBranchBlock = dfg->insert(oldBranchBlock, target2, branchLabel.str());

			// fill blocks
			extractor.extractDivergentBlocks(newFallthoughBlock, newBranchBlock);

			// NOTE(review): in both arms below the label operand is
			// constructed but its use is not visible in this listing.
			if (oldFallthroughBlock == branchBlock) {
				const ir::PTXOperand braLabelOperand(branchLabel.str(), ir::PTXOperand::Label, ir::PTXOperand::s32);
			} else {
				ir::PTXInstruction braPtx(ir::PTXInstruction::Bra);
				ir::PTXOperand braLabelOperand(branchLabel.str(), ir::PTXOperand::Label, ir::PTXOperand::s32);
				ir::Instruction& bra = braPtx;
				dfg->insert(oldFallthroughBlock, bra);

			// all needed edges were already created in block creation,
			// no more edge manipulation needed
			oldFallthroughBlock = newFallthoughBlock;
			oldBranchBlock = newBranchBlock;

	// remove branch instruction on BranchBlock if needed
	if (branchBlock->targets().size() == 0) {
		// branchBlock does not have branch at end, remove that instruction
		// NOTE(review): assumes the block holds at least one instruction;
		// the unsigned `size() - 1` would wrap around for an empty list.
		unsigned int branchInstPos = branchBlock->instructions().size() - 1;
		dfg->erase(branchBlock, branchInstPos);

	// Equal cursors mean the last step produced a unified block.
	if (oldFallthroughBlock == oldBranchBlock) {
		// If target1 has no fall-through, then
		// switch branch and fall-through edges.

		if ( !(target1->block()->has_fallthrough_edge()) ) {
			dfg->setEdgeType(oldFallthroughBlock, target1, ir::ControlFlowGraph::BasicBlock::Edge::Branch);
			dfg->setEdgeType(oldFallthroughBlock, target2, ir::ControlFlowGraph::BasicBlock::Edge::FallThrough);
	// weaved block finishes with a divergent section

	// copy target1 targets to the ending divergent fallthrough block
	dfg->copyOutgoingBranchEdges(target1, oldFallthroughBlock);
	// remove target1

	// copy target2 targets to the ending unified block or divergent branch block
	dfg->copyOutgoingBranchEdges(target2, oldBranchBlock);
	// remove target2

	// remove empty blocks or blocks with just a goto
	// TODO: removing these blocks is not essential for correctness, yet it
	// might increase performance.

	// replaces all uses of old registers to use new ones
	// in code after unified basic blocks
	// (a diagnostic message is written to std::cerr first)
	std::cerr << "\n\n\nCalling register replacement\n\n\n";
	replaceRegisters(dfg, extractor);

	// recalculate live in and live out
void DivergenceAnalysis::_findBranches(branch_set& branches)
	Analysis* dfgAnalysis = getAnalysis("DataflowGraphAnalysis");
	assert(dfgAnalysis != 0);

	DataflowGraph &dfg = static_cast<DataflowGraph&>(*dfgAnalysis);

	/* Create a list of branches that can be divergent, that is,
		they are not  bra.uni and have a predicate */
	DataflowGraph::iterator block = dfg.begin();
	DataflowGraph::iterator endBlock = dfg.end();

	/* Post-dominator tree */
	PostdominatorTree *dtree;
	dtree = (PostdominatorTree*) (getAnalysis("PostDominatorTreeAnalysis"));

	report(" Finding branches");
	for (; block != endBlock; ++block) {
		ir::PTXInstruction *ptxInstruction = NULL;

		if (block->instructions().size() > 0) {
			/* Branch instructions can only be the last
			instruction of a basic block */
			DataflowGraph::Instruction& lastInstruction =

			if (typeid(ir::PTXInstruction) == typeid(*(lastInstruction.i))) {
				ptxInstruction =

				if ((ptxInstruction->opcode == ir::PTXInstruction::Bra)) {
					report("  examining " << ptxInstruction->toString());
					if(ptxInstruction->uni == true) { 
						report("   eliminated, uniform...");
					if(lastInstruction.s.size() == 0) {
						report("   eliminated, wrong source count ("
							<< lastInstruction.s.size() << ")...");
					assert(lastInstruction.s.size() == 1);
					DataflowGraph::iterator postDomBlock =
					if (postDomBlock != dfg.end()) {
						BranchInfo newBranch(&(*block), &(*postDomBlock), 
							lastInstruction, _divergGraph);
						report("   is potentially divergent...");
					else {
						report("   eliminated, no post-dominator...");