/* Turns the instruction at index `id` of `block` into a point where the
 * kernel exits and can later be resumed.
 *
 * A `bar` instruction is rewritten in place into a tail call to
 * "_ZOcelotBarrierKernel"; any other instruction is left untouched. The
 * block is then split right after that instruction, spill code for the
 * register set returned by block->alive() is added to the upper half,
 * matching restore code is added to the lower half, and the lower half is
 * registered as an entry point.
 *
 * block - block that contains the instruction
 * id    - index of the instruction inside block->instructions()
 */
void RemoveBarrierPass::_removeBarrier( analysis::DataflowGraph::iterator block, 
	unsigned int id )
{
	typedef analysis::DataflowGraph::RegisterSet RegisterSet;
	typedef analysis::DataflowGraph::InstructionVector::const_iterator
		InstructionPosition;

	InstructionPosition position( block->instructions().begin() );
	std::advance( position, id );

	// Step back once from end() to reach the last block of the graph
	// (presumably the kernel's exit block - TODO confirm).
	analysis::DataflowGraph::iterator lastBlock( _dfg().end() );
	--lastBlock;

	ir::PTXInstruction& ptx =
		static_cast< ir::PTXInstruction& >( *position->i );

	const bool isBarrier = ( ptx.opcode == ir::PTXInstruction::Bar );

	if( isBarrier )
	{
		report( "  Converting instruction " << ptx.toString() );

		// The barrier becomes a tail call with no destination operand.
		ptx.a = ir::PTXOperand(
			ir::PTXOperand::FunctionName, "_ZOcelotBarrierKernel");
		ptx.d.addressMode = ir::PTXOperand::Invalid;
		ptx.opcode = ir::PTXInstruction::Call;
		ptx.tailCall = true;
		ptx.branchTargetInstruction = -1;

		report( "   Converted to " << ptx.toString() );
	}

	// Captured before the split, while `position` still refers to `block`.
	RegisterSet liveRegisters = block->alive( position );

	analysis::DataflowGraph::iterator resumeBlock =
		_dfg().split( block, id + 1, false );

	_addSpillCode( block, resumeBlock, liveRegisters, isBarrier );
	_addRestoreCode( resumeBlock, liveRegisters );

	_dfg().redirect( block, resumeBlock, lastBlock );

	if( !isBarrier )
	{
		// Only a non-barrier instruction whose guard is not PT gets the
		// extra target edge into the lower half.
		if( ptx.pg.condition != ir::PTXOperand::PT )
		{
			_dfg().target( block, resumeBlock, true );
		}
	}

	_addEntryPoint( resumeBlock );
}
/* Runs the pass over one kernel: resets the per-kernel counters, visits
 * every block of the dataflow graph with _runOnBlock(), and finally calls
 * _addLocalVariables().
 *
 * k - kernel to transform; asserted to be a PTX kernel
 */
void RemoveBarrierPass::runOnKernel( ir::IRKernel& k )
{
	report( "Removing barriers from kernel " << k.name );
	assertM( k.ISA == ir::Instruction::PTX, 
		"This pass is valid for PTX kernels only." );

	// Per-kernel state; both counters start at 1.
	_kernel = static_cast< ir::PTXKernel* >( &k );
	_spillBytes = 1;
	_reentryPoint = 1;

	// end() is re-read on every iteration because _runOnBlock() may add
	// blocks to the graph.
	analysis::DataflowGraph::iterator current = _dfg().begin();
	while( current != _dfg().end() )
	{
		_runOnBlock( current );
		++current;
	}

	_addLocalVariables();
}
/* Allocates a fresh register for AffineRegister::warpPosition when the
 * `_shared` stack reports a non-zero size, then defers to the base-class
 * spill implementation.
 */
void AffineLinearScan::_spill()
{
	const bool usesSharedStack = ( _shared.bytes() > 0 );

	if(usesSharedStack)
	{
		AffineRegister::warpPosition = _dfg().newRegister();
	}

	LinearScanRegisterAllocationPass::_spill();
}
/* Inserts a new block after the first block of the graph that dispatches to
 * `block` when the saved resume point equals block->id().
 *
 * Emitted sequence, in order:
 *   mov  %addr, _Zocelot_resume_point
 *   ld   %value, [%addr]
 *   setp.eq %p, %value, block->id()
 *   @%p bra block->label()
 *
 * block - block to jump to when the resume point matches
 */
void RemoveBarrierPass::_addEntryPoint(
	analysis::DataflowGraph::iterator block )
{
	analysis::DataflowGraph::iterator dispatch =
		_dfg().insert(_dfg().begin());

	// Position of the next instruction inside the dispatch block.
	unsigned int slot = 0;

	// Take the address of the resume-point variable.
	ir::PTXInstruction addressOf( ir::PTXInstruction::Mov );

	addressOf.addressSpace = ir::PTXInstruction::Local;
	addressOf.type = ir::PTXOperand::u32;

	addressOf.a.addressMode = ir::PTXOperand::Address;
	addressOf.a.type = ir::PTXOperand::u32;
	addressOf.a.identifier = "_Zocelot_resume_point";

	addressOf.d.reg = _tempRegister();
	addressOf.d.type = ir::PTXOperand::u32;
	addressOf.d.addressMode = ir::PTXOperand::Register;

	_dfg().insert( dispatch, addressOf, slot++ );

	// Read the stored resume point through that address.
	ir::PTXInstruction readBack( ir::PTXInstruction::Ld );

	readBack.type = ir::PTXOperand::u32;
	readBack.addressSpace = ir::PTXInstruction::Local;
	readBack.a = addressOf.d;

	readBack.d.reg = _tempRegister();
	readBack.d.type = ir::PTXOperand::u32;
	readBack.d.addressMode = ir::PTXOperand::Register;

	_dfg().insert( dispatch, readBack, slot++ );

	// Compare the stored value with the id of the target block.
	ir::PTXInstruction compare( ir::PTXInstruction::SetP );

	compare.comparisonOperator = ir::PTXInstruction::Eq;
	compare.type = ir::PTXOperand::u32;

	compare.d.reg = _tempRegister();
	compare.d.type = ir::PTXOperand::pred;
	compare.d.addressMode = ir::PTXOperand::Register;

	compare.a = readBack.d;

	compare.b.imm_uint = block->id();
	compare.b.type = ir::PTXOperand::u32;
	compare.b.addressMode = ir::PTXOperand::Immediate;

	_dfg().insert( dispatch, compare, slot++ );

	// Branch to the target block, guarded by the comparison result.
	ir::PTXInstruction jump( ir::PTXInstruction::Bra );

	jump.pg = compare.d;
	jump.d.identifier = block->label();
	jump.d.addressMode = ir::PTXOperand::Label;

	_dfg().insert( dispatch, jump, slot++ );

	_dfg().target( dispatch, block );
}
/* Prepends to `block` the code that reloads every register in `alive` from
 * the "_Zocelot_spill_area" variable.
 *
 * Offsets are handed out in the iteration order of `alive`, each register
 * taking ir::PTXOperand::bytes() of its type. Every instruction is inserted
 * at index 0, so the address `mov`, inserted last, ends up first in the
 * block.
 *
 * block - block that receives the restore code
 * alive - registers to reload
 */
void RemoveBarrierPass::_addRestoreCode(
	analysis::DataflowGraph::iterator block, 
	const analysis::DataflowGraph::RegisterSet& alive )
{
	typedef analysis::DataflowGraph::RegisterSet::const_iterator
		RegisterPosition;

	// Built up front so the loads can name its destination register; it
	// is only inserted after the loop.
	ir::PTXInstruction spillBase( ir::PTXInstruction::Mov );

	spillBase.addressSpace = ir::PTXInstruction::Local;
	spillBase.type = ir::PTXOperand::u32;

	spillBase.a.addressMode = ir::PTXOperand::Address;
	spillBase.a.type = ir::PTXOperand::u32;
	spillBase.a.identifier = "_Zocelot_spill_area";

	spillBase.d.reg = _tempRegister();
	spillBase.d.type = ir::PTXOperand::u32;
	spillBase.d.addressMode = ir::PTXOperand::Register;

	unsigned int offset = 0;

	RegisterPosition live = alive.begin();
	while( live != alive.end() )
	{
		ir::PTXInstruction reload( ir::PTXInstruction::Ld );

		reload.addressSpace = ir::PTXInstruction::Local;
		reload.type = live->type;

		// Source: [spillBase + offset]
		reload.a.reg = spillBase.d.reg;
		reload.a.offset = offset;
		reload.a.type = ir::PTXOperand::u32;
		reload.a.addressMode = ir::PTXOperand::Indirect;

		offset += ir::PTXOperand::bytes( reload.type );

		// Destination: the original register.
		reload.d.reg = live->id;
		reload.d.type = live->type;
		reload.d.addressMode = ir::PTXOperand::Register;

		_dfg().insert( block, reload, 0 );

		++live;
	}

	_dfg().insert( block, spillBase, 0 );
}
// Example #6
// 0
/* Scans `block` for the first `bar` instruction or non-tail `call` and hands
 * it to _removeBarrier(). At most one instruction is handled per invocation:
 * the scan stops right after the rewrite.
 *
 * block - block to scan
 */
void RemoveBarrierPass::_runOnBlock( analysis::DataflowGraph::iterator block )
{
	typedef analysis::DataflowGraph::InstructionVector::const_iterator
		const_iterator;

	// `index` tracks the distance of `cursor` from the start of the block.
	unsigned int index = 0;

	for( const_iterator cursor = block->instructions().begin();
		cursor != block->instructions().end(); ++cursor, ++index )
	{
		ir::PTXInstruction& ptx =
			static_cast< ir::PTXInstruction& >( *cursor->i );

		const bool isBarrier = ( ptx.opcode == ir::PTXInstruction::Bar );
		const bool isPlainCall = ( ptx.opcode == ir::PTXInstruction::Call
			&& !ptx.tailCall );

		if( !isBarrier && !isPlainCall )
		{
			continue;
		}

#if 0
		if( _externals != 0
			&& ptx.opcode == ir::PTXInstruction::Call )
		{
			if( _externals->find( ptx.a.identifier ) != 0 )
			{
				report( "Skipping external call "
					<< ptx.toString() );
				continue;
			}
		}
#endif

		// Restart the spill counter at 1 for this rewrite, then keep the
		// larger of the previous and the new value.
		const unsigned int previousBytes = _spillBytes;
		_spillBytes = 1;
		usesBarriers = true;

		_removeBarrier( block, index );

		_spillBytes = std::max( previousBytes, _spillBytes );
		++_reentryPoint;
		_dfg().compute();

		// The block has been modified; stop scanning it.
		break;
	}
}
/* Adds to `block` the code that saves the registers in `alive` to
 * "_Zocelot_spill_area" and records target->id() in
 * "_Zocelot_resume_point". When `isBarrier` is set, `_kernelId` is also
 * stored to "_Zocelot_barrier_next_kernel".
 *
 * Every instruction is inserted at index size() - 1, i.e. in front of the
 * instruction that is currently last in the block. `_spillBytes` is raised
 * to the number of bytes saved here if that is larger.
 *
 * block     - block that receives the spill code
 * target    - block whose id is stored as the resume point
 * alive     - registers to save
 * isBarrier - whether the next-kernel id has to be stored as well
 */
void RemoveBarrierPass::_addSpillCode( analysis::DataflowGraph::iterator block, 
	analysis::DataflowGraph::iterator target, 
	const analysis::DataflowGraph::RegisterSet& alive, bool isBarrier )
{
	typedef analysis::DataflowGraph::RegisterSet::const_iterator
		RegisterPosition;

	unsigned int offset = 0;

	// Address of the spill area.
	ir::PTXInstruction addressOf( ir::PTXInstruction::Mov );

	addressOf.addressSpace = ir::PTXInstruction::Local;
	addressOf.type = ir::PTXOperand::u32;

	addressOf.a.addressMode = ir::PTXOperand::Address;
	addressOf.a.type = ir::PTXOperand::u32;
	addressOf.a.identifier = "_Zocelot_spill_area";

	addressOf.d.reg = _tempRegister();
	addressOf.d.type = ir::PTXOperand::u32;
	addressOf.d.addressMode = ir::PTXOperand::Register;

	_dfg().insert( block, addressOf, block->instructions().size() - 1 );

	report( "   Saving " << alive.size() << " Registers" );

	RegisterPosition live = alive.begin();
	while( live != alive.end() )
	{
		report( "    r" << live->id << " (" 
			<< ir::PTXOperand::bytes( live->type ) << " bytes)" );

		ir::PTXInstruction spill( ir::PTXInstruction::St );

		spill.addressSpace = ir::PTXInstruction::Local;
		spill.type = live->type;

		// Destination: [spill area + offset]
		spill.d.reg = addressOf.d.reg;
		spill.d.offset = offset;
		spill.d.type = ir::PTXOperand::u32;
		spill.d.addressMode = ir::PTXOperand::Indirect;

		offset += ir::PTXOperand::bytes( spill.type );

		// Source: the live register.
		spill.a.reg = live->id;
		spill.a.type = live->type;
		spill.a.addressMode = ir::PTXOperand::Register;

		_dfg().insert( block, spill,
			block->instructions().size() - 1 );

		++live;
	}

	_spillBytes = std::max( offset, _spillBytes );

	// Reuse the mov to take the address of the resume-point variable.
	addressOf.addressSpace = ir::PTXInstruction::Local;
	addressOf.type = ir::PTXOperand::u32;

	addressOf.a.addressMode = ir::PTXOperand::Address;
	addressOf.a.type = ir::PTXOperand::u32;
	addressOf.a.identifier = "_Zocelot_resume_point";

	addressOf.d.reg = _tempRegister();
	addressOf.d.type = ir::PTXOperand::u32;
	addressOf.d.addressMode = ir::PTXOperand::Register;

	_dfg().insert( block, addressOf, block->instructions().size() - 1 );

	// Store the id of the block to resume at.
	ir::PTXInstruction writeBack( ir::PTXInstruction::St );

	writeBack.addressSpace = ir::PTXInstruction::Local;
	writeBack.type = ir::PTXOperand::u32;

	writeBack.d.reg = addressOf.d.reg;
	writeBack.d.type = ir::PTXOperand::u32;
	writeBack.d.addressMode = ir::PTXOperand::Indirect;

	writeBack.a.imm_uint = target->id();
	writeBack.a.type = ir::PTXOperand::u32;
	writeBack.a.addressMode = ir::PTXOperand::Immediate;

	_dfg().insert( block, writeBack, block->instructions().size() - 1 );

	if( !isBarrier )
	{
		return;
	}

	// Barrier only: the same mov/st pair, retargeted, stores _kernelId.
	addressOf.d.reg = _tempRegister();
	addressOf.a.identifier = "_Zocelot_barrier_next_kernel";

	_dfg().insert( block, addressOf,
		block->instructions().size() - 1 );

	writeBack.d.reg = addressOf.d.reg;
	writeBack.a.imm_uint = _kernelId;

	_dfg().insert( block, writeBack,
		block->instructions().size() - 1 );
}
/* Returns a new register id obtained from the dataflow graph. */
analysis::DataflowGraph::RegisterId RemoveBarrierPass::_tempRegister()
{
	const analysis::DataflowGraph::RegisterId fresh = _dfg().newRegister();
	return fresh;
}
void AffineLinearScan::_extendStack()
{
	_shared.declaration(_kernel->locals, MAX_WARPS);
	reportE(INFO, "Kernel " << _kernel->name << " requires " << _shared.bytes()
		<< " bytes of shared memory per warp, total of "
		<< MAX_WARPS * _shared.bytes() << '(' << MAX_WARPS << " warps)");
	LinearScanRegisterAllocationPass::_extendStack();
	reportE(DEBUG, "Writing warp local memory stack access information");
	
	if(_shared.bytes() == 0) return;
	/* warpid = (size_x * ( size_y * z + y ) + x) >> 5
	 * a = size_y
	 * b = z
	 * c = y
	 * a = mad a z c
	 * b = size_x
	 * c = x
	 * a = mad a b c
	 * a = shr a 5 (>>5 == /32)
	 * memPosition = memInitialPosition [ warpid * bytesPerWarp ]
	 */
	analysis::DataflowGraph::iterator block = _dfg().begin();
	RegisterId a, b, c;

	/* Use a AffineRegister temporary register of type u32 if available */
	if(AffineRegister::tempRegisters.count(ir::PTXOperand::DataType::u32) != 0)
	{
		a = AffineRegister::tempRegisters[ir::PTXOperand::DataType::u32];
	}
	else
	{
		a = _dfg().newRegister();
	}
	
	b = _dfg().newRegister();

	/* If memory size is 32 bits, can use warpPosition variable as temporary */
	if(_m->addressSize() == 32)
	{
		c = AffineRegister::warpPosition;
	}
	else
	{
		c = _dfg().newRegister();
	}
	
	// size_y = %ntid.y
	ir::PTXInstruction sizeY(ir::PTXInstruction::Mov);
	sizeY.d = ir::PTXOperand(ir::PTXOperand::Register,
		ir::PTXOperand::DataType::u32, a);
	sizeY.a = ir::PTXOperand(ir::PTXOperand::ntid,
		ir::PTXOperand::iy, ir::PTXOperand::u32);
	sizeY.type = ir::PTXOperand::DataType::u32;
	_dfg().insert(block, sizeY, 0);

	// z = %tid.z
	ir::PTXInstruction z(ir::PTXInstruction::Mov);
	z.d = ir::PTXOperand(ir::PTXOperand::Register,
		ir::PTXOperand::DataType::u32, b);
	z.a = ir::PTXOperand(ir::PTXOperand::tid,
		ir::PTXOperand::iz, ir::PTXOperand::u32);
	z.type = ir::PTXOperand::DataType::u32;
	_dfg().insert(block, z, 1);

	// y = %tid.y
	ir::PTXInstruction y(ir::PTXInstruction::Mov);
	y.d = ir::PTXOperand(ir::PTXOperand::Register,
		ir::PTXOperand::DataType::u32, c);
	y.a = ir::PTXOperand(ir::PTXOperand::tid,
		ir::PTXOperand::iy, ir::PTXOperand::u32);
	y.type = ir::PTXOperand::DataType::u32;
	_dfg().insert(block, y, 2);

	ir::PTXInstruction mad1(ir::PTXInstruction::Mad);
	mad1.d = sizeY.d;
	mad1.a = sizeY.d;
	mad1.b = z.d;
	mad1.c = y.d;
	mad1.type = ir::PTXOperand::DataType::u32;
	mad1.modifier = ir::PTXInstruction::Modifier::lo;
	_dfg().insert(block, mad1, 3);

	// size_x = %ntid.x
	ir::PTXInstruction sizeX(ir::PTXInstruction::Mov);
	sizeX.d = z.d;
	sizeX.a = ir::PTXOperand(ir::PTXOperand::ntid,
		ir::PTXOperand::ix, ir::PTXOperand::u32);
	sizeX.type = ir::PTXOperand::DataType::u32;
	_dfg().insert(block, sizeX, 4);

	// x = %tid.x
	ir::PTXInstruction x(ir::PTXInstruction::Mov);
	x.d = y.d;
	x.a = ir::PTXOperand(ir::PTXOperand::tid,
		ir::PTXOperand::ix, ir::PTXOperand::u32);
	x.type = ir::PTXOperand::DataType::u32;
	_dfg().insert(block, x, 5);

	// 1) warpid = size_x * size_y
	ir::PTXInstruction mad2(ir::PTXInstruction::Mad);
	mad2.d = mad1.d;
	mad2.a = mad1.d;
	mad2.b = sizeX.d;
	mad2.c = x.d;
	mad2.type = ir::PTXOperand::DataType::u32;
	mad2.modifier = ir::PTXInstruction::Modifier::lo;
	_dfg().insert(block, mad2, 6);

	// 5) warpid = [size_x * y + size_x * size_y * z + x] >> 5
	ir::PTXInstruction shr(ir::PTXInstruction::Shr);
	shr.d = mad2.d;
	shr.a = mad2.d;
	shr.b = ir::PTXOperand(5);
	shr.type = ir::PTXOperand::DataType::u32;
	_dfg().insert(block, shr, 7);

	// 6) position = warpid * stride
	ir::PTXInstruction position(ir::PTXInstruction::Mul);
	position.d = shr.d;
	position.a = shr.d;
	position.b = ir::PTXOperand(_shared.bytes());
	position.type = ir::PTXOperand::DataType::u32;
	position.modifier = ir::PTXInstruction::Modifier::lo;
	_dfg().insert(block, position, 8);

	//%memoryStart = stack name;
	ir::PTXInstruction memoryStart(ir::PTXInstruction::Mov);
	memoryStart.a = ir::PTXOperand(_shared.name() +
		"[" + position.d.toString() + "]");
	if(_m->addressSize() == 32)
	{
		memoryStart.d = x.d;
		memoryStart.type = ir::PTXOperand::DataType::u32;
	}
	else
	{
		memoryStart.d = ir::PTXOperand(ir::PTXOperand::Register,
			ir::PTXOperand::DataType::u64, AffineRegister::warpPosition);
		memoryStart.type = ir::PTXOperand::DataType::u64;
	}
	_dfg().insert(block, memoryStart, 9);

}