void FindCliff( size_t nOps, FILE* plot )
{
    size_t nGroups = 32;
    size_t nThreadsPerGroup = 60;

    std::vector<GEN::Instruction> ops;
    ConstructShader( ops, nOps, nThreadsPerGroup );

    GEN::Encoder enc;
    HAXWell::Blob isa;
    isa.SetLength( enc.GetBufferSize( ops.size() ) );
    isa.SetLength( enc.Encode( isa.GetBytes(), ops.data(), ops.size() ) );
    //PrintISA(stdout,isa);

    int CURBE[HAXWell::MAX_DISPATCH_COUNT][8];
    for( size_t i=0; i<HAXWell::MAX_DISPATCH_COUNT; i++ )
        for( size_t j=0; j<8; j++ )
            CURBE[i][j] = 2*i;

    HAXWell::ShaderArgs args;
    args.nDispatchThreadCount = nThreadsPerGroup;
    args.nSIMDMode = 8;
    args.nCURBEAllocsPerThread = 1;
    args.pCURBE = CURBE;
    args.nIsaLength = isa.GetLength();
    args.pIsa = isa.GetBytes();

    HAXWell::ShaderHandle hShader = HAXWell::CreateShader( args );
    HAXWell::BufferHandle hBuffer = HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount );

    HAXWell::TimerHandle h = HAXWell::BeginTimer();
    HAXWell::DispatchShader( hShader, &hBuffer, 1, nGroups );
    HAXWell::EndTimer( h );
    HAXWell::Finish();

    // whole-dispatch timer (not used for the plot; latency comes from the per-thread timestamps below)
    HAXWell::timer_t nTime = HAXWell::ReadTimer( h );

    size_t nThreads = nGroups*args.nDispatchThreadCount;

    unsigned int* pBuff = (unsigned int*) HAXWell::MapBuffer( hBuffer );
    size_t nAvg = 0;
    for( size_t i=0; i<nThreadsPerGroup*nGroups; i++ )
    {
        unsigned int startlo = pBuff[8*i+1];
        unsigned int endlo   = pBuff[8*i+3];
        nAvg += endlo - startlo;
    }
    HAXWell::UnmapBuffer( hBuffer );
    HAXWell::ReleaseShader( hShader );
    HAXWell::ReleaseBuffer( hBuffer );

    double latency = ((double)nAvg) / (nThreadsPerGroup*nGroups);
    fprintf( plot, "%u, %f, %u\n", (unsigned int)(16*ops.size()), latency, (unsigned int)ops.size() );
    printf( "%u, %f, %u\n", (unsigned int)(16*ops.size()), latency, (unsigned int)ops.size() );
}
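A driver along the following lines could sweep the shader length and let FindCliff emit one CSV row per size. This is only a sketch: the sweep range, step, and output filename are assumptions, not part of the harness above.

// Hypothetical driver: sweep instruction count and collect the cliff data into a CSV.
// Range, step, and "cliff.csv" are illustrative choices only.
void RunCliffSweep()
{
    FILE* plot = fopen( "cliff.csv", "w" );
    if( !plot )
        return;

    for( size_t nOps = 16; nOps <= 4096; nOps += 16 )
        FindCliff( nOps, plot );

    fclose( plot );
}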
HAXWell::timer_t InstructionIssueTest( size_t nRegs, size_t simd )
{
    size_t nGroups = 128;
    size_t nThreadsPerGroup = 60;

    std::vector<GEN::Instruction> ops;
    ConstructShader( ops, nRegs, nThreadsPerGroup, simd );

    GEN::Encoder enc;
    HAXWell::Blob isa;
    isa.SetLength( enc.GetBufferSize( ops.size() ) );
    isa.SetLength( enc.Encode( isa.GetBytes(), ops.data(), ops.size() ) );
    // PrintISA(stdout,isa);

    int CURBE[HAXWell::MAX_DISPATCH_COUNT][8];
    for( size_t i=0; i<HAXWell::MAX_DISPATCH_COUNT; i++ )
        for( size_t j=0; j<8; j++ )
            CURBE[i][j] = 2*i;

    HAXWell::ShaderArgs args;
    args.nDispatchThreadCount = nThreadsPerGroup;
    args.nSIMDMode = 16; //(simd <= 8) ? 8 : 16;   // dispatch mode really doesn't seem to matter
    args.nCURBEAllocsPerThread = 1;
    args.pCURBE = CURBE;
    args.nIsaLength = isa.GetLength();
    args.pIsa = isa.GetBytes();

    HAXWell::ShaderHandle hShader = HAXWell::CreateShader( args );
    HAXWell::BufferHandle hBuffer = HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount );

    HAXWell::TimerHandle h = HAXWell::BeginTimer();
    HAXWell::DispatchShader( hShader, &hBuffer, 1, nGroups );
    HAXWell::EndTimer( h );
    HAXWell::Finish();

    HAXWell::ReleaseBuffer( hBuffer );
    HAXWell::ReleaseShader( hShader );
    return HAXWell::ReadTimer( h );
}
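A small sweep like the one below could compare the SIMD8 and SIMD16 shader variants over a range of register counts. It prints the dispatch timer raw, since its units depend on HAXWell's timer implementation; the sweep bounds and the assumption that HAXWell::timer_t converts to an integer are mine, not the harness's.

// Hypothetical sweep: raw dispatch timer vs. register count for the SIMD8 and SIMD16 variants.
// Ranges and output format are illustrative only.
void RunIssueSweep()
{
    for( size_t nRegs = 8; nRegs <= 128; nRegs += 8 )
    {
        HAXWell::timer_t t8  = InstructionIssueTest( nRegs, 8 );
        HAXWell::timer_t t16 = InstructionIssueTest( nRegs, 16 );
        printf( "%u, %llu, %llu\n",
                (unsigned int)nRegs,
                (unsigned long long)t8,
                (unsigned long long)t16 );
    }
}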
double ScatterReadTest( size_t nDivergent )
{
    size_t nGroups = 32;
    size_t nThreadsPerGroup = 60;

    std::vector<GEN::Instruction> ops;
    ConstructShader( ops, nThreadsPerGroup );

    GEN::Encoder enc;
    HAXWell::Blob isa;
    isa.SetLength( enc.GetBufferSize( ops.size() ) );
    isa.SetLength( enc.Encode( isa.GetBytes(), ops.data(), ops.size() ) );
    //PrintISA(stdout,isa);

    int CURBE[HAXWell::MAX_DISPATCH_COUNT][2][8];
    for( size_t i=0; i<HAXWell::MAX_DISPATCH_COUNT; i++ )
    {
        // first CURBE reg is offset into timings buffer
        for( size_t j=0; j<8; j++ )
            CURBE[i][0][j] = 2*i;

        // second CURBE reg is per-channel read offsets for scatter read
        for( size_t j=0; j<8; j++ )
            CURBE[i][1][j] = 8*i + j;

        // send a certain number of them to completely different cache lines
        for( size_t j=0; j<nDivergent; j++ )
            CURBE[i][1][j] += 16*j;

#ifdef SAME_ADDRESS
        // nah, use zero instead....
        for( size_t j=0; j<8; j++ )
            CURBE[i][1][j] = 0;
#endif
    }

    HAXWell::ShaderArgs args;
    args.nDispatchThreadCount = nThreadsPerGroup;
    args.nSIMDMode = 16;
    args.nCURBEAllocsPerThread = 2;
    args.pCURBE = CURBE;
    args.nIsaLength = isa.GetLength();
    args.pIsa = isa.GetBytes();

    HAXWell::ShaderHandle hShader = HAXWell::CreateShader( args );

    HAXWell::BufferHandle hBuffers[] = {
        HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount ),
        HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount*1024 ),
    };

    HAXWell::DispatchShader( hShader, hBuffers, 2, nGroups );
    HAXWell::Finish();

    unsigned int* pBuff = (unsigned int*) HAXWell::MapBuffer( hBuffers[0] );
    size_t nAvg = 0;
    for( size_t i=0; i<nThreadsPerGroup*nGroups; i++ )
    {
        unsigned int startlo = pBuff[8*i];
        unsigned int endlo   = pBuff[8*i+2];
        nAvg += endlo - startlo;
    }
    HAXWell::UnmapBuffer( hBuffers[0] );

    HAXWell::ReleaseBuffer( hBuffers[0] );
    HAXWell::ReleaseBuffer( hBuffers[1] );
    HAXWell::ReleaseShader( hShader );

    return (nAvg) / (double)(nGroups*nThreadsPerGroup);
}
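To see how the measured latency scales with divergence, one might sweep nDivergent across the eight channels covered by the second CURBE register. A minimal sketch, with the output format as an illustrative choice:

// Hypothetical sweep: average scatter-read latency as the number of divergent lanes grows.
void RunScatterSweep()
{
    for( size_t nDivergent = 0; nDivergent <= 8; nDivergent++ )
    {
        double latency = ScatterReadTest( nDivergent );
        printf( "%u divergent lanes: %f\n", (unsigned int)nDivergent, latency );
    }
}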