// Builds a shader of 'nOps' instructions, dispatches it, and measures the
// average per-thread timestamp delta (a latency proxy). Appends one CSV row
// "isa_bytes, latency, instruction_count" to 'plot' and echoes it to stdout.
// NOTE(review): assumes the shader writes start/end timestamp low words at
// dwords [8*i+1] and [8*i+3] of the output buffer — determined by
// ConstructShader, which is defined elsewhere; confirm against that code.
void FindCliff( size_t nOps, FILE* plot )
{
    size_t nGroups = 32;
    size_t nThreadsPerGroup=60;
    std::vector<GEN::Instruction> ops;
    ConstructShader(ops,nOps,nThreadsPerGroup);

    // Encode the instruction stream into an ISA blob. First SetLength
    // reserves the worst-case size; second shrinks to the actual encoded size.
    GEN::Encoder enc;
    HAXWell::Blob isa;
    isa.SetLength( enc.GetBufferSize(ops.size()) );
    isa.SetLength( enc.Encode( isa.GetBytes(), ops.data(), ops.size() ) );

    //PrintISA(stdout,isa);

    // One CURBE register per dispatched thread; each lane receives the
    // thread's offset (2*i) into the timings buffer.
    int CURBE[HAXWell::MAX_DISPATCH_COUNT][8];
    for( size_t i=0; i<HAXWell::MAX_DISPATCH_COUNT; i++ )
        for( size_t j=0; j<8; j++ )
            CURBE[i][j] = 2*i;


    HAXWell::ShaderArgs args;
    args.nDispatchThreadCount = nThreadsPerGroup;
    args.nSIMDMode = 8;
    args.nCURBEAllocsPerThread = 1;
    args.pCURBE = CURBE;
    args.nIsaLength = isa.GetLength();
    args.pIsa = isa.GetBytes();
    HAXWell::ShaderHandle hShader = HAXWell::CreateShader(args);

    // 32 bytes (8 dwords) of timing output per thread.
    HAXWell::BufferHandle hBuffer = HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount );

    HAXWell::TimerHandle h = HAXWell::BeginTimer();
    HAXWell::DispatchShader( hShader, &hBuffer,1,nGroups );
    HAXWell::EndTimer( h );
    
    HAXWell::Finish();

    HAXWell::timer_t nTime = HAXWell::ReadTimer(h);

    unsigned int* pBuff = (unsigned int*)HAXWell::MapBuffer(hBuffer);
  
    // Sum per-thread (end - start) timestamp deltas. Unsigned subtraction
    // stays correct across a single 32-bit timestamp wraparound.
    size_t nAvg=0;
    for( size_t i=0; i<nThreadsPerGroup*nGroups; i++ )
    {
        unsigned int startlo = pBuff[8*i+1];
        unsigned int endlo = pBuff[8*i+3];
        nAvg += endlo-startlo;
    }

    HAXWell::UnmapBuffer(hBuffer);
    HAXWell::ReleaseShader(hShader);
    HAXWell::ReleaseBuffer(hBuffer);
    double latency = ((double)nAvg) / (nThreadsPerGroup*nGroups);

    // %zu for size_t operands: the original %u is undefined behavior on
    // platforms where size_t is wider than unsigned int (e.g. LP64).
    fprintf(plot, "%zu, %f, %zu\n", 16*ops.size(), latency,  ops.size() );
    printf("%zu, %f, %zu\n", 16*ops.size(), latency, ops.size() );

}
Beispiel #2
0
// Builds a shader parameterized by register count and SIMD width, dispatches
// it across 128 groups of 60 threads, and returns the raw GPU timer reading
// spanning the dispatch.
HAXWell::timer_t InstructionIssueTest( size_t nRegs, size_t simd )
{
    const size_t groupCount      = 128;
    const size_t threadsPerGroup = 60;

    std::vector<GEN::Instruction> program;
    ConstructShader( program, nRegs, threadsPerGroup, simd );

    // Encode the instruction stream: reserve worst-case space, then trim
    // the blob to the actual encoded length.
    GEN::Encoder encoder;
    HAXWell::Blob blob;
    blob.SetLength( encoder.GetBufferSize( program.size() ) );
    blob.SetLength( encoder.Encode( blob.GetBytes(), program.data(), program.size() ) );

   // PrintISA(stdout,blob);

    // Per-thread CURBE register: every lane carries the thread's output offset.
    int CURBE[HAXWell::MAX_DISPATCH_COUNT][8];
    for( size_t t=0; t<HAXWell::MAX_DISPATCH_COUNT; t++ )
    {
        for( size_t lane=0; lane<8; lane++ )
            CURBE[t][lane] = 2*t;
    }

    HAXWell::ShaderArgs args;
    args.nDispatchThreadCount  = threadsPerGroup;
    args.nSIMDMode             = 16;//(simd <= 8) ? 8 : 16; // dispatch mode really doesn't seem to matter
    args.nCURBEAllocsPerThread = 1;
    args.pCURBE                = CURBE;
    args.nIsaLength            = blob.GetLength();
    args.pIsa                  = blob.GetBytes();

    HAXWell::ShaderHandle shader = HAXWell::CreateShader( args );
    HAXWell::BufferHandle buffer = HAXWell::CreateBuffer( 0, 32*groupCount*args.nDispatchThreadCount );

    // Time the dispatch, then wait for completion before reading the timer.
    HAXWell::TimerHandle timer = HAXWell::BeginTimer();
    HAXWell::DispatchShader( shader, &buffer, 1, groupCount );
    HAXWell::EndTimer( timer );

    HAXWell::Finish();

    HAXWell::ReleaseBuffer( buffer );
    HAXWell::ReleaseShader( shader );
    return HAXWell::ReadTimer( timer );
}
Beispiel #3
0
// Measures average per-thread latency of a scattered read in which
// 'nDivergent' of the 8 SIMD channels are redirected to distinct cache lines.
// Returns the mean (end - start) timestamp delta over all threads.
// NOTE(review): assumes the shader writes start/end timestamp low words at
// dwords [8*i] and [8*i+2] of buffer 0 — set up by ConstructShader (defined
// elsewhere); confirm against that code.
double ScatterReadTest( size_t nDivergent )
{
    size_t nGroups = 32;
    size_t nThreadsPerGroup=60;
    std::vector<GEN::Instruction> ops;
    ConstructShader(ops,nThreadsPerGroup);

    // Encode the instruction stream: reserve worst-case space, then trim
    // the blob to the actual encoded size.
    GEN::Encoder enc;
    HAXWell::Blob isa;
    isa.SetLength( enc.GetBufferSize(ops.size()) );
    isa.SetLength( enc.Encode( isa.GetBytes(), ops.data(), ops.size() ) );

    //PrintISA(stdout,isa);

    // The second CURBE register has only 8 channels; clamp so the divergence
    // loop below cannot write past CURBE[i][1][7] (stack buffer overflow for
    // nDivergent > 8). Behavior is unchanged for all in-range inputs.
    size_t nDivergentClamped = (nDivergent < 8) ? nDivergent : 8;

    int CURBE[HAXWell::MAX_DISPATCH_COUNT][2][8];
    for( size_t i=0; i<HAXWell::MAX_DISPATCH_COUNT; i++ )
    {
        // first CURBE reg is offset into timings buffer
        for( size_t j=0; j<8; j++ )
            CURBE[i][0][j] = 2*i;

        // second CURBE reg is per-channel read offsets for scatter read
        for( size_t j=0; j<8; j++ )
            CURBE[i][1][j] = 8*i + j;

        // send a certain number of them to completely different cache lines
        for( size_t j=0; j<nDivergentClamped; j++ )
            CURBE[i][1][j] += 16*j;

#ifdef SAME_ADDRESS // nah, use zero instead....
        for( size_t j=0; j<8; j++ )
            CURBE[i][1][j] = 0;
#endif
    }


    HAXWell::ShaderArgs args;
    args.nDispatchThreadCount = nThreadsPerGroup;
    args.nSIMDMode = 16;
    args.nCURBEAllocsPerThread = 2;
    args.pCURBE = CURBE;
    args.nIsaLength = isa.GetLength();
    args.pIsa = isa.GetBytes();
    HAXWell::ShaderHandle hShader = HAXWell::CreateShader(args);

    // Buffer 0: 32 bytes of timing output per thread.
    // Buffer 1: the (much larger) scatter-read target.
    HAXWell::BufferHandle hBuffers[] =
    {
        HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount ),
        HAXWell::CreateBuffer( 0, 32*nGroups*args.nDispatchThreadCount*1024 ) ,
    };

    HAXWell::DispatchShader( hShader, hBuffers,2,nGroups );

    HAXWell::Finish();


    unsigned int* pBuff = (unsigned int*)HAXWell::MapBuffer(hBuffers[0]);

    // Sum per-thread (end - start) timestamp deltas. Unsigned subtraction
    // stays correct across a single 32-bit timestamp wraparound.
    size_t nAvg=0;
    for( size_t i=0; i<nThreadsPerGroup*nGroups; i++ )
    {
        unsigned int startlo = pBuff[8*i];
        unsigned int endlo = pBuff[8*i+2];
        nAvg += endlo-startlo;
    }

    HAXWell::UnmapBuffer(hBuffers[0]);


    HAXWell::ReleaseBuffer(hBuffers[0]);
    HAXWell::ReleaseBuffer(hBuffers[1]);
    HAXWell::ReleaseShader(hShader);

    return (nAvg)/(double)(nGroups*nThreadsPerGroup);
}