Esempio n. 1
0
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = 1024 * 2 + 15;
  
  std::vector<T> input(size);
  
  std::cout << "##Testing scan for " << input.size() << " elements and type " 
	    << magnet::CL::detail::traits<T>::kernel_type();
  
  for(size_t i = 0; i < input.size(); ++i)
    input[i] = i+1;
  
  // create input buffer using pinned memory
  cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR |
		      CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		      sizeof(T) * input.size(), &input[0])
    ;
  
  magnet::CL::scan<T> scanFunctor;
  scanFunctor.build(queue, context);
  
  scanFunctor(bufferIn, bufferIn);
  
  std::vector<T> output(size);
  
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() *
			  sizeof(T), &output[0]);
  bool failed = !testOutput(input, output);

  std::cout << (failed ? " FAILED" : " PASSED") << std::endl; 
  return failed;
}
Esempio n. 2
0
void simulationStep() {
    try {
        // copy
        auto buffer = cl::Buffer(context, CL_MEM_READ_ONLY,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 nullptr, nullptr);
        queue.enqueueWriteBuffer(buffer, CL_TRUE, 0,
                                 sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                 visualizationBufferCPU, NULL, NULL);

        // enque
        stepKernel.setArg(2, buffer);
        cl::NDRange global((size_t) (fieldWidth * fieldHeight));
        queue.enqueueNDRangeKernel(stepKernel, cl::NullRange, global, cl::NullRange);

        // read back
        queue.enqueueReadBuffer(visualizationBufferGPU, CL_TRUE, 0,
                                sizeof(unsigned char) * 4 * fieldWidth * fieldHeight,
                                visualizationBufferCPU, NULL, NULL);

        // finish
        queue.finish();
    } catch (cl::Error err) {
        std::cout << "Error: " << err.what() << "(" << err.err() << ")" << std::endl;
        exit(3);
    }

    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, fieldWidth, fieldHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE,
                 visualizationBufferCPU);
}
Esempio n. 3
0
	void copyFromDevice(cl::CommandQueue &queue)
	{
		if(m_pElts==NULL)
			throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised.");
		
		queue.enqueueReadBuffer(m_buffer, CL_TRUE, 0, m_cb, m_pElts);
	}
Esempio n. 4
0
void runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = 2 << 10;

  std::vector<T> input(size);

  std::cout << "##Testing bitonic sort for " << input.size() << " elements and type " 
	    << magnet::CL::detail::traits<T>::kernel_type()
	    << std::endl;
  
  for(size_t i = 0; i < input.size(); ++i)
    input[i] = input.size() - i - 1;
  
  // create input buffer using pinned memory
  cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR |
		      CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		      sizeof(T) * input.size(), &input[0])
    ;
  
  magnet::CL::bitonicSort<T> bitonicSortFunctor;
  bitonicSortFunctor.build(queue, context);
  bitonicSortFunctor(bufferIn);

  std::vector<T> output(size);
 
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() *
			  sizeof(T), &output[0]);

  if (!testOutput(input, output))
    M_throw() << "Incorrect output for size " 
	      << input.size()
	      << " and type "
	      << magnet::CL::detail::traits<T>::kernel_type();
}
Esempio n. 5
0
 void read(const cl::CommandQueue &q, size_t offset, size_t size, T *host,
         bool blocking = false) const
 {
     if (size)
         q.enqueueReadBuffer(
                 buffer, blocking ? CL_TRUE : CL_FALSE,
                 sizeof(T) * offset, sizeof(T) * size, host
                 );
 }
Esempio n. 6
0
	cl::Event copyFromDeviceAsync(cl::CommandQueue &queue)
	{
		if(m_pElts==NULL)
			throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised.");
		
		cl::Event complete;
		
		queue.enqueueReadBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, NULL, &complete);
		return complete;
	}
Esempio n. 7
0
	cl::Event copyFromDeviceAsync(cl::CommandQueue &queue, cl::Event &prior)
	{
		if(m_pElts==NULL)
			throw cl::Error(CL_INVALID_MEM_OBJECT, "copyFromDevice - Buffer is not initialised.");
		
		cl::Event complete;
		
		std::vector<cl::Event> srcs;
		srcs.push_back(prior);
		queue.enqueueReadBuffer(m_buffer, CL_FALSE, 0, m_cb, m_pElts, &srcs, &complete);
		return complete;
	}
void helper(uint32_t* out, int osize, uint8_t* in, int isize, int w, int h, int choice)
{
	int set_size=8;
    try {
        cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                isize*sizeof(cl_uchar), in, NULL);
        cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
        cl::Buffer bufferOut2= cl::Buffer(gContext, CL_MEM_READ_WRITE, osize*sizeof(cl_uchar4));
        gNV21Kernel.setArg(2,w);
        gNV21Kernel.setArg(3,h);
        gNV21Kernel.setArg(1,bufferIn);
        gNV21Kernel.setArg(0,bufferOut);
        gQueue.enqueueNDRangeKernel(gNV21Kernel,
                cl::NullRange,
                cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                cl::NDRange(set_size,set_size),
                NULL,
                NULL);
        if (choice==1) {
            gLaplacianK.setArg(2,w);
            gLaplacianK.setArg(3,h);
            gLaplacianK.setArg(1,bufferOut);
            gLaplacianK.setArg(0,bufferOut2);
            gQueue.enqueueNDRangeKernel(gLaplacianK,
                    cl::NullRange,
                    cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                    cl::NDRange(set_size,set_size),
                    NULL,
                    NULL);
        }
        else if (choice>1) {
        	gNegative.setArg(2,w);
        	gNegative.setArg(3,h);
        	gNegative.setArg(1,bufferOut);
        	gNegative.setArg(0,bufferOut2);
        	gQueue.enqueueNDRangeKernel(gNegative,
        	                    cl::NullRange,
        	                    cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
        	                    cl::NDRange(set_size,set_size),
        	                    NULL,
        	                    NULL);

        }

        gQueue.enqueueReadBuffer(bufferOut2, CL_TRUE, 0, osize*sizeof(cl_uchar4), out);
    }
    catch (cl::Error e) {
        LOGI("@oclDecoder: %s %d \n",e.what(),e.err());
    }
}
Esempio n. 9
0
/**
 * generate 64 bit unsigned random numbers in device global memory
 *@param tinymt_status device global memories
 *@param total_num total number of work items
 *@param local_num number of local work items
 *@param data_size number of data to generate
 */
static void generate_uint64(Buffer& tinymt_status,
                            int total_num,
                            int local_num,
                            int data_size)
{
#if defined(DEBUG)
    cout << "generate_uint64 start" << endl;
#endif
    int min_size = total_num;
    if (data_size % min_size != 0) {
        data_size = (data_size / min_size + 1) * min_size;
    }
    Kernel uint_kernel(program, "tinymt_uint64_kernel");
    Buffer output_buffer(context,
                         CL_MEM_READ_WRITE,
                         data_size * sizeof(uint64_t));
    uint_kernel.setArg(0, tinymt_status);
    uint_kernel.setArg(1, output_buffer);
    uint_kernel.setArg(2, data_size / total_num);
    NDRange global(total_num);
    NDRange local(local_num);
    Event generate_event;
#if defined(DEBUG)
    cout << "generate_uint64 enque kernel start" << endl;
#endif
    queue.enqueueNDRangeKernel(uint_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &generate_event);
    uint64_t * output = new uint64_t[data_size];
    generate_event.wait();
    queue.enqueueReadBuffer(output_buffer,
                            CL_TRUE,
                            0,
                            data_size * sizeof(uint64_t),
                            output);
    check_data(output, data_size, total_num);
#if defined(DEBUG)
    print_uint64(output, data_size, total_num);
#endif
    double time = get_time(generate_event);
    cout << "generate time:" << time * 1000 << "ms" << endl;
    delete[] output;
#if defined(DEBUG)
    cout << "generate_uint64 end" << endl;
#endif
}
Esempio n. 10
0
void CLArgument::copyFromDevice( cl::CommandQueue &queue )
{
	assert( myBufferInitialized );

	// If we own the memory, nobody else can read it anyways.
	if ( myCopy )
		return;
	if ( !myCopyBack )
		return;

	queue.enqueueReadBuffer(
		myBuffer,
		CL_TRUE,
		0,
		mySize,
		myPtr
	);
}
Esempio n. 11
0
real L2Norm(const Buffer3D & in,cl::CommandQueue & q)
{
	cl::Buffer ans (CLContextLoader::getContext(),CL_MEM_READ_WRITE,sizeof(real)*in.width()*in.height()*in.depth());

	CLContextLoader::getRedL2NormKer().setArg(0,in());
	CLContextLoader::getRedL2NormKer().setArg(1,ans());

	q.enqueueNDRangeKernel(CLContextLoader::getRedL2NormKer(),
					cl::NDRange(0),
					cl::NDRange(in.width()*in.height()*in.depth()),
					getBestWorkspaceDim(cl::NDRange(in.width()*in.height()*in.depth())));

	ans = performReduction(ans,CLContextLoader::getRedSumAllKer(),q,in.width()*in.height()*in.depth());

	real res;
	q.enqueueReadBuffer(ans,true,0,sizeof(real),&res);
	return sqrt(res);
}
Esempio n. 12
0
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for 1 generator.
 *@param tinymt_status internal state of kernel side tinymt
 *@param total total number of work items
 *@param local_item number of local work items
 *@param seed seed for initialization
 */
static void initialize_by_seed(Buffer& tinymt_status,
                               int total,
                               int local_item,
                               uint32_t seed)
{
#if defined(DEBUG)
    cout << "initialize_by_seed start" << endl;
#endif
    Kernel init_kernel(program, "tinymt_init_seed_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
#if defined(DEBUG)
    cout << "global:" << dec << total << endl;
    cout << "group:" << dec << (total / local_item) << endl;
    cout << "local:" << dec << local_item << endl;
#endif
    queue.enqueueNDRangeKernel(init_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &event);
    double time = get_time(event);
    tinymt32j_t status[total];
    queue.enqueueReadBuffer(tinymt_status,
                            CL_TRUE,
                            0,
                            sizeof(tinymt32j_t) * total,
                            status);
    cout << "initializing time = " << time * 1000 << "ms" << endl;
#if defined(DEBUG)
    cout << "status[0].s0:" << hex << status[0].s0 << endl;
    cout << "status[0].s1:" << hex << status[0].s1 << endl;
    cout << "status[0].s2:" << hex << status[0].s2 << endl;
    cout << "status[0].s3:" << hex << status[0].s3 << endl;
#endif
    check_status(status, total);
#if defined(DEBUG)
    cout << "initialize_by_seed end" << endl;
#endif
}
Esempio n. 13
0
/**
 * initialize tinymt status in device global memory
 * using 1 parameter for all generators.
 *@param tinymt_status device global memories
 *@param total total number of work items
 *@param local_item number of local work items
 *@param seed_array seeds for initialization
 *@param seed_size size of seed_array
 */
static void initialize_by_array(Buffer& tinymt_status,
                                int total,
                                int local_item,
                                uint64_t seed_array[],
                                int seed_size)
{
#if defined(DEBUG)
    cout << "initialize_by_array start" << endl;
#endif
    Buffer seed_array_buffer(context,
                             CL_MEM_READ_WRITE,
                             seed_size * sizeof(uint64_t));
    queue.enqueueWriteBuffer(seed_array_buffer,
                             CL_TRUE,
                             0,
                             seed_size * sizeof(uint64_t),
                             seed_array);
    Kernel init_kernel(program, "tinymt_init_array_kernel");
    init_kernel.setArg(0, tinymt_status);
    init_kernel.setArg(1, seed_array_buffer);
    init_kernel.setArg(2, seed_size);
    NDRange global(total);
    NDRange local(local_item);
    Event event;
    queue.enqueueNDRangeKernel(init_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &event);
    double time = get_time(event);
    tinymt64j_t status[total];
    queue.enqueueReadBuffer(tinymt_status,
                            CL_TRUE,
                            0,
                            sizeof(tinymt64j_t) * total,
                            status);
    cout << "initializing time = " << time * 1000 << "ms" << endl;
    check_status(status, total);
#if defined(DEBUG)
    cout << "initialize_by_array end" << endl;
#endif
}
Esempio n. 14
0
    void run(T* buf)
    {
        cl_int err;
        cl::Buffer outbuf(
            m_context,
            CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
            N*N*sizeof(T),
            buf,
            &err);
        checkErr(err, "Buffer::Buffer()");

        err = m_kernel.setArg(0, outbuf);
        checkErr(err, "Kernel::setArg(0)");

        err = m_kernel.setArg(1, N);
        checkErr(err, "Kernel::setArg(1)");

        err = m_kernel.setArg(2, depth);
        checkErr(err, "Kernel::setArg(2)");

        err = m_kernel.setArg(3, escape2);
        checkErr(err, "Kernel::setArg(3)");

        cl::Event event;
        err = m_cmdq.enqueueNDRangeKernel(
            m_kernel,
            cl::NullRange,
            cl::NDRange(N*N),
            cl::NDRange(N, 1),
            NULL,
            &event);
        checkErr(err, "ComamndQueue::enqueueNDRangeKernel()");

        event.wait();
        err = m_cmdq.enqueueReadBuffer(
            outbuf,
            CL_TRUE,
            0,
            N*N*sizeof(T),
            buf);
        checkErr(err, "ComamndQueue::enqueueReadBuffer()");
    }
Esempio n. 15
0
/**
 * generate double precision floating point numbers in the range [0, 1)
 * in device global memory
 *@param tinymt_status device global memories
 *@param total_num total number of work items
 *@param local_num number of local work items
 *@param data_size number of data to generate
 */
static void generate_double01(Buffer& tinymt_status,
                              int total_num,
                              int local_num,
                              int data_size)
{
    int min_size = total_num;
    if (data_size % min_size != 0) {
        data_size = (data_size / min_size + 1) * min_size;
    }
    Kernel double_kernel(program, "tinymt_double01_kernel");
    Buffer output_buffer(context,
                         CL_MEM_READ_WRITE,
                         data_size * sizeof(double));
    double_kernel.setArg(0, tinymt_status);
    double_kernel.setArg(1, output_buffer);
    double_kernel.setArg(2, data_size / total_num);
    NDRange global(total_num);
    NDRange local(local_num);
    Event generate_event;
    queue.enqueueNDRangeKernel(double_kernel,
                               NullRange,
                               global,
                               local,
                               NULL,
                               &generate_event);
    double * output = new double[data_size];
    generate_event.wait();
    queue.enqueueReadBuffer(output_buffer,
                            CL_TRUE,
                            0,
                            data_size * sizeof(double),
                            &output[0]);
    check_data01(output, data_size, total_num);
#if defined(DEBUG)
    print_double(&output[0], data_size, local_num);
#endif
    double time = get_time(generate_event);
    delete[] output;
    cout << "generate time:" << time * 1000 << "ms" << endl;
}
Esempio n. 16
0
void MetaBallsApp::updateMarching()
{
    static const cl_int3 size{ VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH };
    mClCommandQueue.enqueueNDRangeKernel( mKernWriteClear,
                                          cl::NullRange,
                                          cl::NDRange( VOLUME_SIZE ) );
    /* Update volumes */
    mClCommandQueue.enqueueNDRangeKernel( mKernWriteMetaballs,
                                          cl::NullRange,
                                          cl::NDRange(  VOLUME_SIZE ) );
    /* End */
    int zero = 0;
    auto kernelRange = (size.s[0]-1) * (size.s[1]-1) * (size.s[2]-1);
    mClCommandQueue.enqueueWriteBuffer( mClVertIndex, true, 0, sizeof(int), &zero );
    mClCommandQueue.enqueueNDRangeKernel( mKernConstructSurface,
                                          cl::NullRange,
                                          cl::NDRange( kernelRange ) );
    mClCommandQueue.enqueueReadBuffer( mClVertIndex,
                                       true, 0, sizeof(cl_int),
                                       &mMarchingVertsWritten );

    /* Generate Normals */
    if (mMarchingVertsWritten > 0) {
        bool smooth = true;
        if( ! smooth )
            mClCommandQueue.enqueueNDRangeKernel( mKernGenNormals,
                                                  cl::NullRange,
                                                  cl::NDRange( mMarchingVertsWritten ) );
        else
            mClCommandQueue.enqueueNDRangeKernel( mKernGenNormalsSmooth,
                                                  cl::NullRange,
                                                  cl::NDRange( mMarchingVertsWritten ) );
    }

    //if( mDebugDraw )
    mClCommandQueue.enqueueNDRangeKernel( mKernWritePointColorBack,
                                          cl::NullRange,
                                          cl::NDRange( VOLUME_SIZE ) );
}
void Reduce::enqueue(
    const cl::CommandQueue &commandQueue,
    bool blocking,
    const cl::Buffer &inBuffer,
    void *out,
    ::size_t first,
    ::size_t elements,
    const VECTOR_CLASS<cl::Event> *events,
    cl::Event *event, cl::Event *reduceEvent)
{
    VECTOR_CLASS<cl::Event> reduceEvents(1);
    cl::Event readEvent;
    
    if (out == NULL)
        throw cl::Error(CL_INVALID_VALUE, "clogs::Reduce::enqueue: out is NULL");

    //{inviwo::ScopedClockCPU clock("Reduce", "Reduce", -0.1f);
    enqueue(commandQueue, inBuffer, sums, first, elements, reduceBlocks, events, &reduceEvents[0]);
    //auto buf = commandQueue.enqueueMapBuffer(sums, true, CL_MAP_READ, reduceBlocks * elementSize, elementSize, &reduceEvents,
    //    &readEvent);
    //memcpy(buf, out, elementSize);
    //commandQueue.enqueueUnmapMemObject(sums, buf);
    commandQueue.enqueueReadBuffer(
        sums, blocking,
        reduceBlocks * elementSize,
        elementSize,
        out,
        &reduceEvents,
        &readEvent);
    //}
    doEventCallback(readEvent);
    if (event != NULL)
        *event = readEvent;
    if (reduceEvent != NULL)
        *reduceEvent = reduceEvents[0];
}
Esempio n. 18
0
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = (1 << 16) + 16384;

  std::vector<T> input(size);

  std::cout << "##Testing AMD radix sort for " << input.size() << " elements and type " 
	    << magnet::CL::detail::traits<T>::kernel_type();
  
  for(size_t i = 0; i < input.size(); ++i)
    input[i] = input.size() - i - 1;
  
  // create input buffer using pinned memory
  cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR |
		      CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		      sizeof(T) * input.size(), &input[0]);
  
  magnet::CL::radixSortAMD<T> radixSortFunctor;
  radixSortFunctor.build(queue, context);
  radixSortFunctor(bufferIn, bufferIn);

  std::vector<T> output(size);
 
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() *
			  sizeof(T), &output[0]);

  bool failed = !testOutput(input, output);

  std::cout << " key(only) " << (failed ? "FAILED" : "PASSED") << ", "; 

  //Now test with some data!
  //Refresh the input array
  queue.enqueueWriteBuffer(bufferIn, CL_TRUE, 0, input.size() *
			   sizeof(T), &input[0]);

  //Write a data array
  std::vector<cl_uint> data(size);
  for(size_t i = 0; i < input.size(); ++i)
    data[i] = i;

  cl::Buffer dataIn(context, CL_MEM_ALLOC_HOST_PTR |
		    CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		    sizeof(cl_uint) * data.size(), &data[0])
    ;

  radixSortFunctor(bufferIn, dataIn, bufferIn, dataIn);
  
  queue.enqueueReadBuffer(dataIn, CL_TRUE, 0, data.size() *
			  sizeof(cl_uint), &data[0]);

  bool keyfail = !testOutput(input, output);

  std::cout << " key " << (keyfail ? "FAILED" : "PASSED"); 

  bool datafail = false;
  for(size_t i = 0; i < input.size(); ++i)
    if (data[i] != input.size() - 1 - i)
      datafail = true;

  std::cout << " data " << (datafail ? "FAILED" : "PASSED") << std::endl;
  
  return failed || keyfail || datafail;
}
Esempio n. 19
0
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
    if(!isTransferBW)
        return 0;

    float timed, gbps;
    cl::NDRange globalSize, localSize;
    cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
    int iters = devInfo.transferBWIters;
    Timer timer;

    cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
    cl_uint numItems;

    // Set an upper-limit for cpu devies
    if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) {
        numItems = roundToPowOf2(maxItems, 26);
    } else {
        numItems = roundToPowOf2(maxItems);
    }

    float *arr = new float[numItems];

    try
    {
        cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float)));

        cout << NEWLINE TAB TAB "Transfer bandwidth (GBPS)" << endl;
        cout << setprecision(2) << fixed;

        ///////////////////////////////////////////////////////////////////////////
        // enqueueWriteBuffer
        cout << TAB TAB TAB "enqueueWriteBuffer         : ";    cout.flush();

        // Dummy warm-up
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
        queue.finish();

        timed = 0;

        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            Timer timer;

            timer.start();
            for(int i=0; i<iters; i++)
            {
                queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
            }
            queue.finish();
            timed = timer.stopAndTime();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////
        // enqueueReadBuffer
        cout << TAB TAB TAB "enqueueReadBuffer          : ";    cout.flush();

        // Dummy warm-up
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
        queue.finish();

        timed = 0;
        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            Timer timer;

            timer.start();
            for(int i=0; i<iters; i++)
            {
                queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
            }
            queue.finish();
            timed = timer.stopAndTime();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////
        // enqueueMapBuffer
        cout << TAB TAB TAB "enqueueMapBuffer(for read) : ";    cout.flush();

        queue.finish();

        timed = 0;
        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                void *mapPtr;

                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent);
                queue.finish();
                queue.enqueueUnmapMemObject(clBuffer, mapPtr);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            for(int i=0; i<iters; i++)
            {
                Timer timer;
                void *mapPtr;

                timer.start();
                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
                queue.finish();
                timed += timer.stopAndTime();

                queue.enqueueUnmapMemObject(clBuffer, mapPtr);
                queue.finish();
            }
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////

        // memcpy from mapped ptr
        cout << TAB TAB TAB TAB "memcpy from mapped ptr   : ";  cout.flush();
        queue.finish();

        timed = 0;
        for(int i=0; i<iters; i++)
        {
            cl::Event timeEvent;
            void *mapPtr;

            mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
            queue.finish();

            timer.start();
            memcpy(arr, mapPtr, (numItems * sizeof(float)));
            timed += timer.stopAndTime();

            queue.enqueueUnmapMemObject(clBuffer, mapPtr);
            queue.finish();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;

        ///////////////////////////////////////////////////////////////////////////

        // enqueueUnmap
        cout << TAB TAB TAB "enqueueUnmap(after write)  : ";    cout.flush();

        queue.finish();

        timed = 0;
        if(useEventTimer)
        {
            for(int i=0; i<iters; i++)
            {
                cl::Event timeEvent;
                void *mapPtr;

                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
                queue.finish();
                queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent);
                queue.finish();
                timed += timeInUS(timeEvent);
            }
        } else
        {
            for(int i=0; i<iters; i++)
            {
                Timer timer;
                void *mapPtr;

                mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
                queue.finish();

                timer.start();
                queue.enqueueUnmapMemObject(clBuffer, mapPtr);
                queue.finish();
                timed += timer.stopAndTime();
            }
        }
        timed /= iters;
        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;

        cout << gbps << endl;
        ///////////////////////////////////////////////////////////////////////////

        // memcpy to mapped ptr
        cout << TAB TAB TAB TAB "memcpy to mapped ptr     : ";  cout.flush();
        queue.finish();

        timed = 0;
        for(int i=0; i<iters; i++)
        {
            cl::Event timeEvent;
            void *mapPtr;

            mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
            queue.finish();

            timer.start();
            memcpy(mapPtr, arr, (numItems * sizeof(float)));
            timed += timer.stopAndTime();

            queue.enqueueUnmapMemObject(clBuffer, mapPtr);
            queue.finish();
        }
        timed /= iters;

        gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
        cout << gbps << endl;

        ///////////////////////////////////////////////////////////////////////////


    }
    catch(cl::Error error)
    {
        cerr << error.what() << "(" << error.err() << ")" << endl;
        cerr << TAB TAB TAB "Tests skipped" << endl;

        if(arr)     delete [] arr;
        return -1;
    }

    if(arr)     delete [] arr;
    return 0;
}
Esempio n. 20
0
void helper(uint8_t* in, int isize, int choice[])
{
	int set_NDRange_size=16;
	char* filePathptr;
	int result = 0;

	//opening file and closing. Appended to in later locations
	 FILE *log = fopen("logfile.txt", "w");
	    if (!log) {
	        printf("\nCannot open logfile.txt for writing.\n");
	        return;   // bail out if we can't log
	    }



    try {
        cl::Buffer bufferIn = cl::Buffer(gContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                isize*sizeof(cl_uchar), in, NULL);
        cl::Buffer bufferOut = cl::Buffer(gContext, CL_MEM_READ_WRITE, isize*sizeof(cl_uchar4));

    	//int result = mainRunBFS(char * argv[]); // need a c string to pass it

    	/*
        gLaplacianK.setArg(2,w);
        gLaplacianK.setArg(3,h);
        gLaplacianK.setArg(1,bufferOut);
        gLaplacianK.setArg(0,bufferOut2);
        gQueue.enqueueNDRangeKernel(gLaplacianK,
                cl::NullRange,
                cl::NDRange( (int)ceil((float)w/16.0f)*16,(int)ceil((float)h/16.0f)*16),
                cl::NDRange(set_NDRange_size,set_NDRange_size),
                NULL,
                NULL);
        */

        if (choice[0]==1) { //leukocyte


        }
        if (choice[1]==1){ //heartwall

        }

        if (choice[2]==1){ //CFD

        }

        if (choice[3]==1){ //LUdecomp

        }

        if (choice[4]==1){ //hotspot

        }

        if (choice[5]==1){ //backprop

        }

        if (choice[6]==1){ //needleman

        }

        if (choice[7]==1){ //kmeans

        }

        if (choice[8]==1){ //BFS
        	fclose(log);
        	char filePath[] = "../assets/graph4096.txt";
        	filePathptr = &filePath[0];
        	result = mainRunBFS(&filePathptr);
        	if (result < 0) {
        		FILE *log = fopen("logfile.txt", "a");
        		fprintf(log, "\n\n----------\nError in running mainRunBFS\n----------\n\n");
        		fclose(log);
        	}
        }

        if (choice[9]==1){ //srad

        }

        if (choice[10]==1){ //streamcluster

        }

        if (choice[11]==1){ //particle filter

        }

        if (choice[12]==1){ //pathfinder

        }

        if (choice[13]==1){ //gaussian

        }

        if (choice[14]==1){ //k nearest

        }

        if (choice[15]==1){ //lava

        }

        if (choice[16]==1){ //myocyte

        }

        if (choice[17]==1){ //btree

        }

        if (choice[18]==1){ //gpudwt

        }

        if (choice[19]==1){ //hybrid sort

        }

        //last arg in this should be out???
        gQueue.enqueueReadBuffer(bufferOut, CL_TRUE, 0, isize*sizeof(cl_uchar4), NULL);

    }
    catch (cl::Error e) {
        LOGI("@oclDecoder: %s %d \n",e.what(),e.err());
    }
    if(log) {
    	fclose(log);
    }
}
void watershed(int width, int height,
               cl::Buffer& src,
               cl::Buffer& labeled,
               ProgramCache& cache,
               cl::CommandQueue& queue)
{

#ifdef OPENCL_PROFILE
    watershed_descent_kernel_time = 0;
    watershed_increment_kernel_time = 0;
    watershed_minima_kernel_time = 0;
    watershed_plateau_kernel_time = 0;
    watershed_flood_kernel_time = 0;
#endif

    cl::Context context = queue.getInfo<CL_QUEUE_CONTEXT>();

    std::stringstream params_stream;
    params_stream << "-DBLOCK_SIZE=";
    params_stream << BLOCK_SIZE;

    std::string program_params = params_stream.str();

    cl::Program& program = cache.getProgram("Watershed", program_params);
    
    cl::Kernel descent_kernel(program, "descent_kernel");
    cl::Kernel increment_kernel(program, "increment_kernel");
    cl::Kernel minima_kernel(program, "minima_kernel");
    cl::Kernel plateau_kernel(program, "plateau_kernel");
    cl::Kernel flood_kernel(program, "flood_kernel");

    //setting constant memory with neigbourhood
    cl::Buffer cl_neighbourhood_x = cl::Buffer(context,CL_MEM_READ_ONLY,
                                               sizeof(neighbourhood_x));
    cl::Buffer cl_neighbourhood_y = cl::Buffer(context, CL_MEM_READ_ONLY,
                                               sizeof(neighbourhood_y));

#ifdef OPENCL_PROFILE
    cl::Event first_event;
    queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0,
                             sizeof(neighbourhood_x),
                             neighbourhood_x, __null, &first_event);
#else
    queue.enqueueWriteBuffer(cl_neighbourhood_x, CL_TRUE, 0,
                             sizeof(neighbourhood_x), neighbourhood_x);
#endif

    queue.enqueueWriteBuffer(cl_neighbourhood_y, CL_TRUE, 0,
                             sizeof(neighbourhood_y), neighbourhood_y);

    //const size_t block_size = 6;
    //cl::LocalSpaceArg local_mem = cl::__local(block_size * block_size * sizeof(float));
    cl::LocalSpaceArg local_mem = cl::__local(BLOCK_SIZE * BLOCK_SIZE * sizeof(float));

    //setting args for descent_kernel
    descent_kernel.setArg(0, src);
    descent_kernel.setArg(1, labeled);
    descent_kernel.setArg(2, cl_neighbourhood_x);
    descent_kernel.setArg(3, cl_neighbourhood_y);
    descent_kernel.setArg(4, local_mem);
    descent_kernel.setArg(5, width);
    descent_kernel.setArg(6, height);

    size_t global_width = (width / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE;
    size_t global_height = (height / (BLOCK_SIZE - 2) + 1) * BLOCK_SIZE;

#ifdef DEBUG_PRINT
    std::cout << "global width=" << global_width
              << " global height=" << global_height << std::endl;
#endif

    cl::NDRange global(global_width, global_height);
    cl::NDRange local(BLOCK_SIZE, BLOCK_SIZE);

    cl_int status;

#ifdef OPENCL_PROFILE
    {
        VECTOR_CLASS<cl::Event> events_vector(1);
        status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange,
                                            global, local, __null,
                                            &events_vector[0]);

        cl::WaitForEvents(events_vector);

        cl::Event& event = events_vector[0];

        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        cl_ulong total_time = end - start;

        watershed_descent_kernel_time = total_time;
    }
#else
    status = queue.enqueueNDRangeKernel(descent_kernel, cl::NullRange,
                                        global, local);
#endif

#ifdef DEBUG_PRINT
    std::cout << "kernel execution " << status << std::endl;
#endif

   // queue.flush();
   // queue.enqueueBarrier();

    /* PREPARING INCREMENT KERNEL */

    increment_kernel.setArg(0, labeled);
    increment_kernel.setArg(1, width);
    increment_kernel.setArg(2, height);

#ifdef OPENCL_PROFILE
    {
        VECTOR_CLASS<cl::Event> events_vector(1);
        status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange,
                                            global, local, __null,
                                            &events_vector[0]);

        cl::WaitForEvents(events_vector);

        cl::Event& event = events_vector[0];

        cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        cl_ulong total_time = end - start;

        watershed_increment_kernel_time = total_time;
    }
#else
    status = queue.enqueueNDRangeKernel(increment_kernel, cl::NullRange,
                                        global, local);
#endif

//    queue.enqueueBarrier();


    /* PREPARING MINIMA KERNEL */

    int counter_tmp = 0;
    cl::Buffer counter(context, CL_MEM_READ_WRITE, sizeof(int));
    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    minima_kernel.setArg(0, counter);
    minima_kernel.setArg(1, labeled);
    minima_kernel.setArg(2, cl_neighbourhood_x);
    minima_kernel.setArg(3, cl_neighbourhood_y);
    minima_kernel.setArg(4, local_mem);
    minima_kernel.setArg(5, width);
    minima_kernel.setArg(6, height);

    int old_val = -1;
    int new_val = -2;
    int c = 0;


    while(old_val != new_val)
    {
        old_val = new_val;

#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange,
                                                global, local, __null,
                                                &events_vector[0]);

            cl::WaitForEvents(events_vector);
            cl::Event& event = events_vector[0];
            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_minima_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(minima_kernel, cl::NullRange,
                                            global, local);
#endif
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
        c++;
    }

#ifdef DEBUG_PRINT
    std::cout << "step 2: " << c << " iterations" << std::endl;
#endif

    /* PREPARING PLATEAU KERNEL */

    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);

    queue.enqueueBarrier();

    plateau_kernel.setArg(0, counter);
    plateau_kernel.setArg(1, src);
    plateau_kernel.setArg(2, labeled);
    plateau_kernel.setArg(3, cl_neighbourhood_x);
    plateau_kernel.setArg(4, cl_neighbourhood_y);
    plateau_kernel.setArg(5, local_mem);
    plateau_kernel.setArg(6, width);
    plateau_kernel.setArg(7, height);

    old_val = -1;
    new_val = -2;
    c = 0;

#ifdef OPENCL_PROFILE
    watershed_plateau_kernel_time = 0;
#endif

    while(old_val != new_val)
    {
        old_val = new_val;

#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange,
                                                global, local, __null,
                                                &events_vector[0]);

            cl::WaitForEvents(events_vector);
            cl::Event& event = events_vector[0];
            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_plateau_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(plateau_kernel, cl::NullRange,
                                            global, local);
#endif
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
        c++;
    }

#ifdef DEBUG_PRINT
    std::cout << "step 3: " << c << " iterations" << std::endl;
#endif

    //preparing flood kernel

    queue.enqueueWriteBuffer(counter, CL_TRUE, 0, sizeof(int), &counter_tmp);
    queue.enqueueBarrier();

    flood_kernel.setArg(0, counter);
    flood_kernel.setArg(1, labeled);
    flood_kernel.setArg(2, width);
    flood_kernel.setArg(3, height);

    old_val = -1;
    new_val = -2;
    c = 0;

    int new_block_size = 16;
    local = cl::NDRange(new_block_size, new_block_size);

    int n_width = ((width - 1) / new_block_size + 2) * new_block_size;
    int n_height = ((height - 1) / new_block_size + 2) * new_block_size;

    global = cl::NDRange(n_width, n_height);

#ifdef DEBUG_PRINT
    std::cout << "flood kernel invocation params:" << std::endl;
    std::cout << "local: " << local[0] << ", " << local[1] << std::endl;
    std::cout << "global: " << global[0] << ", " << global[1] << std::endl;
#endif

#ifdef OPENCL_PROFILE
    cl::Event last_event;
#endif


#ifdef OPENCL_PROFILE
    watershed_flood_kernel_time = 0;
#endif

    while(old_val != new_val)
    {
        old_val = new_val;

#ifdef OPENCL_PROFILE
        {
            VECTOR_CLASS<cl::Event> events_vector(1);
            status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange,
                                                global, local, __null,
                                                &events_vector[0]);

            cl::WaitForEvents(events_vector);
            cl::Event& event = events_vector[0];
            cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
            cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
            cl_ulong total_time = end - start;

            watershed_flood_kernel_time += total_time;
        }
#else
        status = queue.enqueueNDRangeKernel(flood_kernel, cl::NullRange,
                                            global, local);
#endif

#ifdef OPENCL_PROFILE
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int),
                                &new_val, __null, &last_event);
#else
        queue.enqueueReadBuffer(counter, CL_TRUE, 0, sizeof(int), &new_val);
#endif
        c++;
    }

#ifdef OPENCL_PROFILE
    watershed_descent_kernel_time /= TIME_DIVISOR;
    watershed_increment_kernel_time /= TIME_DIVISOR;
    watershed_minima_kernel_time /= TIME_DIVISOR;
    watershed_plateau_kernel_time /= TIME_DIVISOR;
    watershed_flood_kernel_time /= TIME_DIVISOR;
#endif

#ifdef DEBUG_PRINT
    std::cout << "step 4: " << c << " iterations" << std::endl;
#endif

#ifdef OPENCL_PROFILE
    last_event.wait();

    cl_ulong start = first_event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
    cl_ulong end = last_event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
    cl_ulong total_time = end - start;

    setLastExecutionTime(total_time/TIME_DIVISOR);
#endif
}
inline void OpenCL::readarg(std::vector<T> & arg, cl::Buffer &buf,cl::CommandQueue &quene) const{
    quene.enqueueReadBuffer(buf,CL_FALSE,0,arg.size()*sizeof(T),&(arg[0]));
}
inline void OpenCL::readarg(T (&arg)[N], cl::Buffer &buf,cl::CommandQueue &quene) const{
    quene.enqueueReadBuffer(buf,CL_FALSE,0,N*sizeof(T),arg);
}
int MaxValueSimple::maxValueCL(int* values, size_t len) {
	try {
		cl_int status = CL_SUCCESS;

		/*** Ausgabe von Informationen ueber gewaehltes OpenCL-Device ***/
		/* TODO logging
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max compute units: " << devices[0].getInfo<
		 CL_DEVICE_MAX_COMPUTE_UNITS> ());
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max work item sizes: "
		 << devices[0].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES> ()[0]);
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max work group sizes: "
		 << devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE> ());
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max global mem size (KB): "
		 << devices[0].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE> ()
		 / 1024);
		 Logger::logDebug(
		 METHOD,
		 Logger::sStream << "max local mem size (KB): "
		 << devices[0].getInfo<CL_DEVICE_LOCAL_MEM_SIZE> ()
		 / 1024);
		 */

		/*** Erstellen und Vorbereiten der Daten ***/
		cl::Buffer vBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
				sizeof(cl_int) * len, &values[0], &status);
		if (status != CL_SUCCESS) {
			throw cl::Error(status, "cl::Buffer values");
		}
		cmdQ.finish();

		/*** Arbeitsgroeszen berechnen ***/
		// Anzahl der Work-Items = globalSize
		// Work-Items pro Work-Group = localSize
		const size_t MAX_GROUP_SIZE = devices[0].getInfo<
				CL_DEVICE_MAX_WORK_GROUP_SIZE> ();
		size_t globalSize;
		size_t localSize;

		do {
			globalSize = len;
			localSize = MaxValueSimple::calcWorkGroupSize(globalSize,
					MAX_GROUP_SIZE);
			if (localSize == 1) {
				globalSize = ceil((double) len / WG_FAC) * WG_FAC;
				localSize = MaxValueSimple::calcWorkGroupSize(globalSize,
						MAX_GROUP_SIZE);
				/* TODO logging
				 Logger::logDebug(
				 METHOD,
				 Logger::sStream << "GlobalSize has been extended to "
				 << globalSize);
				 */
			}
			/* TODO logging
			 Logger::logDebug(METHOD,
			 Logger::sStream << "globalSize: " << globalSize);
			 Logger::logDebug(METHOD,
			 Logger::sStream << "localSize: " << localSize);
			 */

			/*** Kernel-Argumente setzen  ***/
			status = kernel.setArg(0, vBuffer);
			if (status != CL_SUCCESS) {
				throw cl::Error(status, "Kernel.SetArg");
			}

			status = kernel.setArg(1, sizeof(cl_int) * localSize, NULL);
			if (status != CL_SUCCESS) {
				throw cl::Error(status, "Kernel.SetArg");
			}

			/*** Kernel ausfuehren und auf Abarbeitung warten ***/
			cl::KernelFunctor func = kernel.bind(cmdQ, cl::NDRange(globalSize),
					cl::NDRange(localSize));

			event = func();

			event.wait();
			cmdQ.finish();

			/*
			 runtimeKernel
			 += event.getProfilingInfo<CL_PROFILING_COMMAND_END> ();
			 runtimeKernel
			 -= event.getProfilingInfo<CL_PROFILING_COMMAND_START> ();
			 */
			len = globalSize / localSize;
		} while (globalSize > localSize && localSize > 1);

		/*** Daten vom OpenCL-Device holen ***/
		// TODO nur 1. element auslesen
		status = cmdQ.enqueueReadBuffer(vBuffer, true, 0, sizeof(cl_int) * 1,
				&values[0]);
		if (status != CL_SUCCESS) {
			throw cl::Error(status, "CommandQueue.enqueueReadBuffer");
		}

		/* TODO logging
		 Logger::log(
		 METHOD,
		 TIME,
		 Logger::sStream << "timeKernel=" << 1.0e-9 * runtimeKernel
		 << ";");
		 */
		return values[0];
	} catch (cl::Error& err) {
		// TODO Logger::logError(METHOD, Logger::sStream << err.what());
		std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): "
				<< err.what() << " (" << err.err() << ")" << std::endl;
		return MaxValueSimple::MAX_FAILURE;
	} catch (std::exception& err) {
		// TODO Logger::logError(METHOD, Logger::sStream << err.what());
		std::cerr << "[ERROR] MaxValueSimple::maxValueCL(int*, size_t): "
				<< err.what() << std::endl;
		return MaxValueSimple::MAX_FAILURE;
	}
}
Esempio n. 25
0
int main()
{
    try {
        std::vector<cl::Device> devices;

        // select platform
        cl::Platform platform = selectPlatform();

        // select device
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        cl::Device device = selectDevice(devices);

        // create context
        context = cl::Context(devices);

        // create command queue
        queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE);

        // load opencl source
        std::ifstream cl_file("inclusive_scan.cl");

        std::string cl_string{std::istreambuf_iterator<char>(cl_file),
                    std::istreambuf_iterator<char>()};

        cl::Program::Sources source(1,
                                    std::make_pair(cl_string.c_str(),
                                                   cl_string.length() + 1));

        // create programm
        program = cl::Program(context, source);

        // compile opencl source
        try {
            program.build(devices);

            size_t input_size;
            std::ifstream input_file("input.txt");
            input_file >> input_size;

            std::vector<float> input(input_size);

//            for (size_t i = 0; i < input_size; ++i) {
//                input[i] = i % 10;
//            }

            for (int i = 0; i < input_size; i++) {
                input_file >> input[i];
            }

            std::vector<float> output(input_size, 0);

            cl::Buffer dev_input (context, CL_MEM_READ_ONLY, sizeof(float) * input_size);
            queue.enqueueWriteBuffer(dev_input, CL_TRUE, 0, sizeof(float) * input_size, &input[0]);

            cl::Buffer dev_output = inclusive_scan(dev_input, input_size);

            queue.enqueueReadBuffer(dev_output, CL_TRUE, 0, sizeof(float) * input_size, &output[0]);
            queue.finish();

            cpu_check(input, output);

            std::ofstream output_file("output.txt");
            for (int i = 0; i < input_size; i++) {
                output_file << output[i] << " ";
            }

        }
        catch (cl::Error const & e) {
            std::string log_str = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
            std::cout << std::endl << e.what() << " : " << e.err() << std::endl;
            std::cout << log_str;
            return 0;
        }


    }
    catch (cl::Error const & e) {
        std::cout << "Error: " << e.what() << " #" << e.err() << std::endl;
    }

    return 0;
}
Esempio n. 26
0
int clPeak::runTransferBandwidthTest(cl::CommandQueue &queue, cl::Program &prog, device_info_t &devInfo)
{
  if(!isTransferBW)
    return 0;

  float timed, gbps;
  cl::NDRange globalSize, localSize;
  cl::Context ctx = queue.getInfo<CL_QUEUE_CONTEXT>();
  int iters = devInfo.transferBWIters;
  Timer timer;
  float *arr = NULL;

  cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2;
  cl_uint numItems;

  // Set an upper-limit for cpu devies
  if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) {
    numItems = roundToPowOf2(maxItems, 26);
  } else {
    numItems = roundToPowOf2(maxItems);
  }

  try
  {
    arr = new float[numItems];
    cl::Buffer clBuffer = cl::Buffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR), (numItems * sizeof(float)));

    log->print(NEWLINE TAB TAB "Transfer bandwidth (GBPS)" NEWLINE);
    log->xmlOpenTag("transfer_bandwidth");
    log->xmlAppendAttribs("unit", "gbps");

    ///////////////////////////////////////////////////////////////////////////
    // enqueueWriteBuffer
    log->print(TAB TAB TAB "enqueueWriteBuffer         : ");

    // Dummy warm-up
    queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
    queue.finish();

    timed = 0;

    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      Timer timer;

      timer.start();
      for(int i=0; i<iters; i++)
      {
        queue.enqueueWriteBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
      }
      queue.finish();
      timed = timer.stopAndTime();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueuewritebuffer", gbps);
    ///////////////////////////////////////////////////////////////////////////
    // enqueueReadBuffer
    log->print(TAB TAB TAB "enqueueReadBuffer          : ");

    // Dummy warm-up
    queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
    queue.finish();

    timed = 0;
    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      Timer timer;

      timer.start();
      for(int i=0; i<iters; i++)
      {
        queue.enqueueReadBuffer(clBuffer, CL_TRUE, 0, (numItems * sizeof(float)), arr);
      }
      queue.finish();
      timed = timer.stopAndTime();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueuereadbuffer", gbps);
    ///////////////////////////////////////////////////////////////////////////
    // enqueueMapBuffer
    log->print(TAB TAB TAB "enqueueMapBuffer(for read) : ");

    queue.finish();

    timed = 0;
    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        void *mapPtr;

        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)), NULL, &timeEvent);
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      for(int i=0; i<iters; i++)
      {
        Timer timer;
        void *mapPtr;

        timer.start();
        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
        queue.finish();
        timed += timer.stopAndTime();

        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
      }
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueuemapbuffer", gbps);
    ///////////////////////////////////////////////////////////////////////////

    // memcpy from mapped ptr
    log->print(TAB TAB TAB TAB "memcpy from mapped ptr   : ");
    queue.finish();

    timed = 0;
    for(int i=0; i<iters; i++)
    {
      cl::Event timeEvent;
      void *mapPtr;

      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_READ, 0, (numItems * sizeof(float)));
      queue.finish();

      timer.start();
      memcpy(arr, mapPtr, (numItems * sizeof(float)));
      timed += timer.stopAndTime();

      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("memcpy_from_mapped_ptr", gbps);

    ///////////////////////////////////////////////////////////////////////////

    // enqueueUnmap
    log->print(TAB TAB TAB "enqueueUnmap(after write)  : ");

    queue.finish();

    timed = 0;
    if(useEventTimer)
    {
      for(int i=0; i<iters; i++)
      {
        cl::Event timeEvent;
        void *mapPtr;

        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
        queue.finish();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr, NULL, &timeEvent);
        queue.finish();
        timed += timeInUS(timeEvent);
      }
    } else
    {
      for(int i=0; i<iters; i++)
      {
        Timer timer;
        void *mapPtr;

        mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
        queue.finish();

        timer.start();
        queue.enqueueUnmapMemObject(clBuffer, mapPtr);
        queue.finish();
        timed += timer.stopAndTime();
      }
    }
    timed /= iters;
    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;

    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("enqueueunmap", gbps);
    ///////////////////////////////////////////////////////////////////////////

    // memcpy to mapped ptr
    log->print(TAB TAB TAB TAB "memcpy to mapped ptr     : ");
    queue.finish();

    timed = 0;
    for(int i=0; i<iters; i++)
    {
      cl::Event timeEvent;
      void *mapPtr;

      mapPtr = queue.enqueueMapBuffer(clBuffer, CL_TRUE, CL_MAP_WRITE, 0, (numItems * sizeof(float)));
      queue.finish();

      timer.start();
      memcpy(mapPtr, arr, (numItems * sizeof(float)));
      timed += timer.stopAndTime();

      queue.enqueueUnmapMemObject(clBuffer, mapPtr);
      queue.finish();
    }
    timed /= iters;

    gbps = ((float)numItems * sizeof(float)) / timed / 1e3f;
    log->print(gbps);   log->print(NEWLINE);
    log->xmlRecord("memcpy_to_mapped_ptr", gbps);

    ///////////////////////////////////////////////////////////////////////////
    log->xmlCloseTag();     // transfer_bandwidth

    if(arr)     delete [] arr;
  }
  catch(cl::Error error)
  {
    stringstream ss;
    ss << error.what() << " (" << error.err() << ")" NEWLINE
       << TAB TAB TAB "Tests skipped" NEWLINE;
    log->print(ss.str());

    if(arr)     delete [] arr;
    return -1;
  }

  return 0;
}
Esempio n. 27
0
bool runTestType(cl::Context context, cl::CommandQueue queue)
{
  cl_uint size = 64 * 256;

  std::vector<T> input(size);

  for(size_t i = 0; i < input.size(); ++i)
    input[i] = input.size() - i - 1;
  
  // create input buffer using pinned memory
  cl::Buffer bufferIn(context, CL_MEM_ALLOC_HOST_PTR |
		      CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		      sizeof(T) * input.size(), &input[0])
    ;
  
  magnet::CL::sort<T> sortFunctor;
  sortFunctor.build(queue, context);
  sortFunctor(bufferIn);

  std::cout << "##Testing generic sort (";
  switch(sortFunctor.getMode())
    {
    case magnet::CL::sort<T>::CPU:
      std::cout << "HeapSort";
      break;
    case magnet::CL::sort<T>::NVIDIA:
      std::cout << "radixNVIDIA";
      break;
    case magnet::CL::sort<T>::AMD:
      std::cout << "radixAMD";
      break;
    default:
      M_throw() << "Could not determine which sorting algorithm is being used";
    }

  std::cout << ") for " << input.size() << " elements and type " 
	    << magnet::CL::detail::traits<T>::kernel_type();
  

  std::vector<T> output(size);
 
  queue.enqueueReadBuffer(bufferIn, CL_TRUE, 0, input.size() *
			  sizeof(T), &output[0]);

  bool failed = !testOutput(input, output);

  std::cout << " key(only) " << (failed ? "FAILED" : "PASSED") << ", "; 

  //Now test with some data!
  //Refresh the input array
  queue.enqueueWriteBuffer(bufferIn, CL_TRUE, 0, input.size() *
			   sizeof(T), &input[0]);

  //Write a data array
  std::vector<cl_uint> data(size);
  for(size_t i = 0; i < input.size(); ++i)
    data[i] = i;

  cl::Buffer dataIn(context, CL_MEM_ALLOC_HOST_PTR |
		    CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE, 
		    sizeof(cl_uint) * data.size(), &data[0])
    ;

  sortFunctor(bufferIn, dataIn);
  
  queue.enqueueReadBuffer(dataIn, CL_TRUE, 0, data.size() *
			  sizeof(cl_uint), &data[0]);

  bool keyfail = false;//!testOutput(input, output);

  std::cout << " key " << (keyfail ? "FAILED" : "PASSED"); 

  bool datafail = false;
  for(size_t i = 0; i < input.size(); ++i)
    if (data[i] != input.size() - 1 - i)
      datafail = true;

  std::cout << " data " << (datafail ? "FAILED" : "PASSED") << std::endl;
  
  return failed || keyfail || datafail;
}