/*
	Cut `block` right after the instruction at index `id` and build the
	code needed to leave and later resume at that point.

	When the instruction is a barrier it is rewritten in place into a tail
	call to "_ZOcelotBarrierKernel"; any other instruction is left as is.
	The registers reported alive at the instruction are spilled in the top
	half, reloaded in the new bottom half, and an entry point that
	dispatches to the bottom half is added.

	block - block that holds the instruction
	id    - index of the instruction inside the block
*/
void RemoveBarrierPass::_removeBarrier(
	analysis::DataflowGraph::iterator block, unsigned int id )
{
	typedef analysis::DataflowGraph Graph;

	Graph::InstructionVector::const_iterator position(
		block->instructions().begin() );
	std::advance( position, id );

	// Last block of the graph: one step back from end().
	Graph::iterator lastBlock( _dfg().end() );
	std::advance( lastBlock, -1 );

	ir::PTXInstruction& ptx = static_cast< ir::PTXInstruction& >(
		*position->i );

	const bool isBarrier = ( ptx.opcode == ir::PTXInstruction::Bar );

	if( isBarrier )
	{
		report( " Converting instruction " << ptx.toString() );

		ptx.opcode                  = ir::PTXInstruction::Call;
		ptx.tailCall                = true;
		ptx.branchTargetInstruction = -1;
		ptx.a = ir::PTXOperand( ir::PTXOperand::FunctionName,
			"_ZOcelotBarrierKernel");
		ptx.d.addressMode           = ir::PTXOperand::Invalid;

		report( " Converted to " << ptx.toString() );
	}

	// Registers that have to survive across the split point.
	const Graph::RegisterSet liveRegisters = block->alive( position );

	Graph::iterator resumeBlock = _dfg().split( block, id + 1, false );

	_addSpillCode( block, resumeBlock, liveRegisters, isBarrier );
	_addRestoreCode( resumeBlock, liveRegisters );

	// NOTE(review): presumably re-points the block -> resumeBlock edge at
	// the last block of the graph; confirm against DataflowGraph::redirect.
	_dfg().redirect( block, resumeBlock, lastBlock );

	// Only a non-barrier instruction whose guard is not PT gets an
	// additional edge to the resume block.
	if( !isBarrier )
	{
		if( ptx.pg.condition != ir::PTXOperand::PT )
		{
			_dfg().target( block, resumeBlock, true );
		}
	}

	_addEntryPoint( resumeBlock );
}
/*
	Entry point of the pass: process every block of the kernel's dataflow
	graph and then add the local variables used by the generated code.

	k - kernel to transform; asserted to be a PTX kernel
*/
void RemoveBarrierPass::runOnKernel( ir::IRKernel& k )
{
	report( "Removing barriers from kernel " << k.name );
	assertM( k.ISA == ir::Instruction::PTX,
		"This pass is valid for PTX kernels only." );

	// Reset the per-kernel state.
	_kernel       = static_cast< ir::PTXKernel* >( &k );
	_spillBytes   = 1;
	_reentryPoint = 1;

	// end() is re-read on every iteration, as _runOnBlock() may modify
	// the graph while it is being walked.
	analysis::DataflowGraph::iterator current = _dfg().begin();
	while( current != _dfg().end() )
	{
		_runOnBlock( current );
		++current;
	}

	_addLocalVariables();
}
/*
	Run the base-class spill step, first allocating a new register for
	AffineRegister::warpPosition when _shared reports a non-zero size.
*/
void AffineLinearScan::_spill()
{
	const bool sharedInUse = _shared.bytes() > 0;

	if(sharedInUse)
	{
		AffineRegister::warpPosition = _dfg().newRegister();
	}

	LinearScanRegisterAllocationPass::_spill();
}
/*
	Insert a new block at the front of the graph that branches to `block`
	when the value loaded through "_Zocelot_resume_point" equals the id of
	`block`.

	The new block contains four instructions, in this order:
	  mov   tmp0, _Zocelot_resume_point
	  ld    tmp1, tmp0
	  setp  pred, tmp1 == block->id()
	  bra   block->label(), guarded by pred

	block - block that execution should resume at
*/
void RemoveBarrierPass::_addEntryPoint(
	analysis::DataflowGraph::iterator block )
{
	analysis::DataflowGraph::iterator dispatch =
		_dfg().insert(_dfg().begin());

	// Address of the resume-point variable.
	ir::PTXInstruction address( ir::PTXInstruction::Mov );
	address.type          = ir::PTXOperand::u32;
	address.addressSpace  = ir::PTXInstruction::Local;

	address.a.addressMode = ir::PTXOperand::Address;
	address.a.identifier  = "_Zocelot_resume_point";
	address.a.type        = ir::PTXOperand::u32;

	address.d.addressMode = ir::PTXOperand::Register;
	address.d.type        = ir::PTXOperand::u32;
	address.d.reg         = _tempRegister();

	_dfg().insert( dispatch, address, 0 );

	// Value stored at the resume point.
	ir::PTXInstruction resumeId( ir::PTXInstruction::Ld );
	resumeId.type          = ir::PTXOperand::u32;
	resumeId.addressSpace  = ir::PTXInstruction::Local;

	resumeId.a             = address.d;

	resumeId.d.addressMode = ir::PTXOperand::Register;
	resumeId.d.type        = ir::PTXOperand::u32;
	resumeId.d.reg         = _tempRegister();

	_dfg().insert( dispatch, resumeId, 1 );

	// Predicate: does the stored value name this block?
	ir::PTXInstruction compare( ir::PTXInstruction::SetP );
	compare.type               = ir::PTXOperand::u32;
	compare.comparisonOperator = ir::PTXInstruction::Eq;

	compare.d.addressMode      = ir::PTXOperand::Register;
	compare.d.type             = ir::PTXOperand::pred;
	compare.d.reg              = _tempRegister();

	compare.a                  = resumeId.d;

	compare.b.addressMode      = ir::PTXOperand::Immediate;
	compare.b.type             = ir::PTXOperand::u32;
	compare.b.imm_uint         = block->id();

	_dfg().insert( dispatch, compare, 2 );

	// Guarded jump to the resume block.
	ir::PTXInstruction jump( ir::PTXInstruction::Bra );
	jump.d.addressMode = ir::PTXOperand::Label;
	jump.d.identifier  = block->label();
	jump.pg            = compare.d;

	_dfg().insert( dispatch, jump, 3 );

	_dfg().target( dispatch, block );
}
/*
	Put code at the top of `block` that reloads every register in `alive`
	from "_Zocelot_spill_area".

	Offsets are assigned in iteration order of `alive`, each register
	taking ir::PTXOperand::bytes( type ) bytes.  Every load is inserted at
	index 0, and the mov that produces the base address is inserted at
	index 0 last so that it ends up ahead of all loads.

	block - block that receives the restore code
	alive - registers to reload
*/
void RemoveBarrierPass::_addRestoreCode(
	analysis::DataflowGraph::iterator block,
	const analysis::DataflowGraph::RegisterSet& alive )
{
	typedef analysis::DataflowGraph::RegisterSet::const_iterator
		register_iterator;

	// Base address of the spill area.
	ir::PTXInstruction base( ir::PTXInstruction::Mov );
	base.type          = ir::PTXOperand::u32;
	base.addressSpace  = ir::PTXInstruction::Local;

	base.a.addressMode = ir::PTXOperand::Address;
	base.a.identifier  = "_Zocelot_spill_area";
	base.a.type        = ir::PTXOperand::u32;

	base.d.addressMode = ir::PTXOperand::Register;
	base.d.type        = ir::PTXOperand::u32;
	base.d.reg         = _tempRegister();

	unsigned int offset = 0;

	register_iterator current = alive.begin();
	while( current != alive.end() )
	{
		ir::PTXInstruction reload( ir::PTXInstruction::Ld );
		reload.type          = current->type;
		reload.addressSpace  = ir::PTXInstruction::Local;

		reload.a.addressMode = ir::PTXOperand::Indirect;
		reload.a.type        = ir::PTXOperand::u32;
		reload.a.reg         = base.d.reg;
		reload.a.offset      = offset;

		reload.d.addressMode = ir::PTXOperand::Register;
		reload.d.type        = current->type;
		reload.d.reg         = current->id;

		offset += ir::PTXOperand::bytes( reload.type );

		_dfg().insert( block, reload, 0 );

		++current;
	}

	_dfg().insert( block, base, 0 );
}
/*
	Look for the first barrier or non-tail call in `block` and hand it to
	_removeBarrier().  At most one instruction is handled per invocation;
	the scan stops right after it.

	_spillBytes is reset to 1 around the call and afterwards set to the
	larger of its previous value and the value left by _removeBarrier().

	block - block to scan
*/
void RemoveBarrierPass::_runOnBlock(
	analysis::DataflowGraph::iterator block )
{
	typedef analysis::DataflowGraph::InstructionVector::const_iterator
		const_iterator;

	for( const_iterator current = block->instructions().begin();
		current != block->instructions().end(); ++current )
	{
		ir::PTXInstruction& ptx = static_cast< ir::PTXInstruction& >(
			*current->i );

		const bool isBarrier = ptx.opcode == ir::PTXInstruction::Bar;
		const bool isPlainCall = ptx.opcode == ir::PTXInstruction::Call
			&& !ptx.tailCall;

		if( !isBarrier && !isPlainCall )
		{
			continue;
		}

		#if 0
		if( _externals != 0
			&& ptx.opcode == ir::PTXInstruction::Call )
		{
			if( _externals->find( ptx.a.identifier ) != 0 )
			{
				report( "Skipping external call " << ptx.toString() );
				continue;
			}
		}
		#endif

		const unsigned int previousBytes = _spillBytes;
		_spillBytes  = 1;
		usesBarriers = true;

		_removeBarrier( block, std::distance(
			const_iterator( block->instructions().begin() ), current ) );

		_spillBytes = std::max( previousBytes, _spillBytes );
		++_reentryPoint;

		_dfg().compute();

		// The graph was modified by _removeBarrier() and recomputed, so
		// do not keep iterating over this block.
		break;
	}
}
void RemoveBarrierPass::_addSpillCode( analysis::DataflowGraph::iterator block, analysis::DataflowGraph::iterator target, const analysis::DataflowGraph::RegisterSet& alive, bool isBarrier ) { unsigned int bytes = 0; ir::PTXInstruction move ( ir::PTXInstruction::Mov ); move.type = ir::PTXOperand::u32; move.addressSpace = ir::PTXInstruction::Local; move.a.identifier = "_Zocelot_spill_area"; move.a.addressMode = ir::PTXOperand::Address; move.a.type = ir::PTXOperand::u32; move.d.reg = _tempRegister(); move.d.addressMode = ir::PTXOperand::Register; move.d.type = ir::PTXOperand::u32; _dfg().insert( block, move, block->instructions().size() - 1 ); report( " Saving " << alive.size() << " Registers" ); for( analysis::DataflowGraph::RegisterSet::const_iterator reg = alive.begin(); reg != alive.end(); ++reg ) { report( " r" << reg->id << " (" << ir::PTXOperand::bytes( reg->type ) << " bytes)" ); ir::PTXInstruction save( ir::PTXInstruction::St ); save.type = reg->type; save.addressSpace = ir::PTXInstruction::Local; save.d.addressMode = ir::PTXOperand::Indirect; save.d.reg = move.d.reg; save.d.type = ir::PTXOperand::u32; save.d.offset = bytes; bytes += ir::PTXOperand::bytes( save.type ); save.a.addressMode = ir::PTXOperand::Register; save.a.type = reg->type; save.a.reg = reg->id; _dfg().insert( block, save, block->instructions().size() - 1 ); } _spillBytes = std::max( bytes, _spillBytes ); move.type = ir::PTXOperand::u32; move.addressSpace = ir::PTXInstruction::Local; move.a.identifier = "_Zocelot_resume_point"; move.a.addressMode = ir::PTXOperand::Address; move.a.type = ir::PTXOperand::u32; move.d.reg = _tempRegister(); move.d.addressMode = ir::PTXOperand::Register; move.d.type = ir::PTXOperand::u32; _dfg().insert( block, move, block->instructions().size() - 1 ); ir::PTXInstruction save( ir::PTXInstruction::St ); save.type = ir::PTXOperand::u32; save.addressSpace = ir::PTXInstruction::Local; save.d.addressMode = ir::PTXOperand::Indirect; save.d.reg = move.d.reg; save.d.type = 
ir::PTXOperand::u32; save.a.addressMode = ir::PTXOperand::Immediate; save.a.type = ir::PTXOperand::u32; save.a.imm_uint = target->id(); _dfg().insert( block, save, block->instructions().size() - 1 ); if( isBarrier ) { move.d.reg = _tempRegister(); move.a.identifier = "_Zocelot_barrier_next_kernel"; _dfg().insert( block, move, block->instructions().size() - 1 ); save.d.reg = move.d.reg; save.a.imm_uint = _kernelId; _dfg().insert( block, save, block->instructions().size() - 1 ); } }
/*
	Obtain a new register id from the dataflow graph.

	Returns the id produced by _dfg().newRegister().
*/
analysis::DataflowGraph::RegisterId RemoveBarrierPass::_tempRegister()
{
	const analysis::DataflowGraph::RegisterId fresh = _dfg().newRegister();
	return fresh;
}
void AffineLinearScan::_extendStack() { _shared.declaration(_kernel->locals, MAX_WARPS); reportE(INFO, "Kernel " << _kernel->name << " requires " << _shared.bytes() << " bytes of shared memory per warp, total of " << MAX_WARPS * _shared.bytes() << '(' << MAX_WARPS << " warps)"); LinearScanRegisterAllocationPass::_extendStack(); reportE(DEBUG, "Writing warp local memory stack access information"); if(_shared.bytes() == 0) return; /* warpid = (size_x * ( size_y * z + y ) + x) >> 5 * a = size_y * b = z * c = y * a = mad a z c * b = size_x * c = x * a = mad a b c * a = shr a 5 (>>5 == /32) * memPosition = memInitialPosition [ warpid * bytesPerWarp ] */ analysis::DataflowGraph::iterator block = _dfg().begin(); RegisterId a, b, c; /* Use a AffineRegister temporary register of type u32 if available */ if(AffineRegister::tempRegisters.count(ir::PTXOperand::DataType::u32) != 0) { a = AffineRegister::tempRegisters[ir::PTXOperand::DataType::u32]; } else { a = _dfg().newRegister(); } b = _dfg().newRegister(); /* If memory size is 32 bits, can use warpPosition variable as temporary */ if(_m->addressSize() == 32) { c = AffineRegister::warpPosition; } else { c = _dfg().newRegister(); } // size_y = %ntid.y ir::PTXInstruction sizeY(ir::PTXInstruction::Mov); sizeY.d = ir::PTXOperand(ir::PTXOperand::Register, ir::PTXOperand::DataType::u32, a); sizeY.a = ir::PTXOperand(ir::PTXOperand::ntid, ir::PTXOperand::iy, ir::PTXOperand::u32); sizeY.type = ir::PTXOperand::DataType::u32; _dfg().insert(block, sizeY, 0); // z = %tid.z ir::PTXInstruction z(ir::PTXInstruction::Mov); z.d = ir::PTXOperand(ir::PTXOperand::Register, ir::PTXOperand::DataType::u32, b); z.a = ir::PTXOperand(ir::PTXOperand::tid, ir::PTXOperand::iz, ir::PTXOperand::u32); z.type = ir::PTXOperand::DataType::u32; _dfg().insert(block, z, 1); // y = %tid.y ir::PTXInstruction y(ir::PTXInstruction::Mov); y.d = ir::PTXOperand(ir::PTXOperand::Register, ir::PTXOperand::DataType::u32, c); y.a = ir::PTXOperand(ir::PTXOperand::tid, 
ir::PTXOperand::iy, ir::PTXOperand::u32); y.type = ir::PTXOperand::DataType::u32; _dfg().insert(block, y, 2); ir::PTXInstruction mad1(ir::PTXInstruction::Mad); mad1.d = sizeY.d; mad1.a = sizeY.d; mad1.b = z.d; mad1.c = y.d; mad1.type = ir::PTXOperand::DataType::u32; mad1.modifier = ir::PTXInstruction::Modifier::lo; _dfg().insert(block, mad1, 3); // size_x = %ntid.x ir::PTXInstruction sizeX(ir::PTXInstruction::Mov); sizeX.d = z.d; sizeX.a = ir::PTXOperand(ir::PTXOperand::ntid, ir::PTXOperand::ix, ir::PTXOperand::u32); sizeX.type = ir::PTXOperand::DataType::u32; _dfg().insert(block, sizeX, 4); // x = %tid.x ir::PTXInstruction x(ir::PTXInstruction::Mov); x.d = y.d; x.a = ir::PTXOperand(ir::PTXOperand::tid, ir::PTXOperand::ix, ir::PTXOperand::u32); x.type = ir::PTXOperand::DataType::u32; _dfg().insert(block, x, 5); // 1) warpid = size_x * size_y ir::PTXInstruction mad2(ir::PTXInstruction::Mad); mad2.d = mad1.d; mad2.a = mad1.d; mad2.b = sizeX.d; mad2.c = x.d; mad2.type = ir::PTXOperand::DataType::u32; mad2.modifier = ir::PTXInstruction::Modifier::lo; _dfg().insert(block, mad2, 6); // 5) warpid = [size_x * y + size_x * size_y * z + x] >> 5 ir::PTXInstruction shr(ir::PTXInstruction::Shr); shr.d = mad2.d; shr.a = mad2.d; shr.b = ir::PTXOperand(5); shr.type = ir::PTXOperand::DataType::u32; _dfg().insert(block, shr, 7); // 6) position = warpid * stride ir::PTXInstruction position(ir::PTXInstruction::Mul); position.d = shr.d; position.a = shr.d; position.b = ir::PTXOperand(_shared.bytes()); position.type = ir::PTXOperand::DataType::u32; position.modifier = ir::PTXInstruction::Modifier::lo; _dfg().insert(block, position, 8); //%memoryStart = stack name; ir::PTXInstruction memoryStart(ir::PTXInstruction::Mov); memoryStart.a = ir::PTXOperand(_shared.name() + "[" + position.d.toString() + "]"); if(_m->addressSize() == 32) { memoryStart.d = x.d; memoryStart.type = ir::PTXOperand::DataType::u32; } else { memoryStart.d = ir::PTXOperand(ir::PTXOperand::Register, 
ir::PTXOperand::DataType::u64, AffineRegister::warpPosition); memoryStart.type = ir::PTXOperand::DataType::u64; } _dfg().insert(block, memoryStart, 9); }