示例#1
0
cl_int WINAPI wine_clEnqueueBarrier(cl_command_queue command_queue)
{
    cl_int ret;
    TRACE("\n");
    ret = clEnqueueBarrier(command_queue);
    return ret;
}
示例#2
0
    /// Enqueues a barrier in the queue.
    void enqueue_barrier()
    {
        BOOST_ASSERT(m_queue != 0);

        #ifdef CL_VERSION_1_2
        clEnqueueBarrierWithWaitList(m_queue, 0, 0, 0);
        #else
        clEnqueueBarrier(m_queue);
        #endif
    }
示例#3
0
void cl_launch_kernel(int ni, int nj, int nk, int nl, DATA_TYPE alpha, DATA_TYPE beta)
{
	size_t localWorkSize[2], globalWorkSize[2];
	localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
	localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
	globalWorkSize[0] = (size_t)ceil(((float)NI) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = (size_t)ceil(((float)NL) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;

	/* Start timer. */
  	polybench_start_instruments;
	
	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&tmp_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&b_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), (void *)&ni);
	errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&nj);
	errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&nk);
	errcode |= clSetKernelArg(clKernel1, 6, sizeof(int), (void *)&nl);
	errcode |= clSetKernelArg(clKernel1, 7, sizeof(DATA_TYPE), (void *)&alpha);
	errcode |= clSetKernelArg(clKernel1, 8, sizeof(DATA_TYPE), (void *)&beta);
	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");
	// Execute the OpenCL kernel
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clEnqueueBarrier(clCommandQue);

	globalWorkSize[0] = (size_t)ceil(((float)NI) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = (size_t)ceil(((float)NL) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;
	
	errcode =  clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&tmp_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&c_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&dOutputFromGpu_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), (void *)&ni);
	errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), (void *)&nj);
	errcode |= clSetKernelArg(clKernel2, 5, sizeof(int), (void *)&nk);
	errcode |= clSetKernelArg(clKernel2, 6, sizeof(int), (void *)&nl);
	errcode |= clSetKernelArg(clKernel2, 7, sizeof(DATA_TYPE), (void *)&alpha);
	errcode |= clSetKernelArg(clKernel2, 8, sizeof(DATA_TYPE), (void *)&beta);
	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");

	// Execute the OpenCL kernel
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clFinish(clCommandQue);

	/* Stop and print timer. */
	printf("GPU Time in seconds:\n");
  	polybench_stop_instruments;
 	polybench_print_instruments;
}
示例#4
0
void cl_launch_kernel(int nx, int ny)
{
	size_t localWorkSize[2], globalWorkSize[2];
	localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
	localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
	globalWorkSize[0] = (size_t)ceil(((float)NX) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = 1;

	/* Start timer. */
  	polybench_start_instruments;
	
	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&x_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&tmp_mem_obj);
	errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), (void *)&nx);
	errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&ny);
	if(errcode != CL_SUCCESS) printf("Error in setting arguments\n");

	// Execute the OpenCL kernel
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clEnqueueBarrier(clCommandQue);
	
	globalWorkSize[0] = (size_t)ceil(((float)NY) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = 1;

	// Set the arguments of the kernel
	errcode =  clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&a_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&y_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&tmp_mem_obj);
	errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), (void *)&nx);
	errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), (void *)&ny);
	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");
	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
	clFinish(clCommandQue);

	/* Stop and print timer. */
	printf("GPU Time in seconds:\n");
  	polybench_stop_instruments;
 	polybench_print_instruments;
}
bool ParallelBitonicASort::sort(int n, cl_mem in, cl_mem out) const
{
	int clStatus;
	int nk = 0;

	cl_mem buffers[2];
	buffers[0] = in;
	buffers[1] = out;
	int current = 0;


	for (int length = 1; length<n; length <<= 1) 
		for (int inc = length; inc>0; inc >>= 1)
		{
			//c->clearArgs(kid);
			clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffers[current]);
			clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffers[1 - current]);
			clSetKernelArg(kernel, 2, sizeof(int), &inc);
			int tmp = length << 1;
			clSetKernelArg(kernel, 3, sizeof(int), &tmp);

			//c->pushArg(kid, );
			//c->pushArg(kid, buffers[1 - current]);
			//c->pushArg(kid, length << 1);
			
			size_t global_size = n;
			size_t local_size = 256;
			//c->enqueueKernel(targetDevice, kid, n, 1, mWG, 1, EventVector());

			clStatus = clEnqueueNDRangeKernel(env.get_command_queue(), kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
			clEnqueueBarrier(env.get_command_queue());
			//c->enqueueKernel(targetDevice, kid, n, 1, 256, 1, EventVector());
			//c->enqueueBarrier(targetDevice); // sync
			current = 1 - current;
			nk++;
		}

	clFinish(env.get_command_queue());

	return (current == 1);  // output must be in OUT
}
示例#6
0
void oclCombo1col1( double *a, double *D, double *tausqy, 
      double* tausqphi, double *By, double *results,
      int *na1, int *nc1, int *F1) 
{

  int i, j, k, iter ;
  int na = na1[0], nc = nc1[0], F=F1[0];
  int Fm1 = F - 1 ;

  double *Bphi ;
  double neweigendenom, normmean, normstd ;

  size_t sizea = na * na*sizeof(double);
  size_t sizec = na * sizeof(double); // Changed from mat ver
  size_t sizer = 2 * na * sizeof(double); // Changed from mat ver

  // allocate array on host
 
  Bphi = (double *)malloc(sizec);
  for (i = 0; i < na; i++)
       Bphi[i] = 0.0f ;

    // Use this to check the output of each API call
    //     cl_int status;
    

    cl_int status;
    //-----------------------------------------------------
    // STEP 1: Discover and initialize the platforms
    //-----------------------------------------------------

    cl_uint numPlatforms = 0;
    cl_platform_id *platforms = NULL;

    // Use clGetPlatformIDs() to retrieve the number of 
    // platforms
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (status != CL_SUCCESS) {
        /* printf( "Error getting platform id %d.\n", status );
        exit(status); */
     }

    // Allocate enough space for each platform
    platforms =
        (cl_platform_id*)malloc(
           numPlatforms*sizeof(cl_platform_id));
    
    // Fill in platforms with clGetPlatformIDs()
    status = clGetPlatformIDs(numPlatforms, platforms,
                NULL);

    if (status != CL_SUCCESS) {
        /* printf( "Error getting platform id.\n" );
        exit(status);  */
     }

    cl_uint numDevices = 0;
    cl_device_id *devices = NULL;

    status = clGetDeviceIDs(
        platforms[0],
        CL_DEVICE_TYPE_ALL,
        0,
        NULL,
        &numDevices);

    if (status != CL_SUCCESS) {
       /* printf( "Error getting device id.\n" );
        exit(status); */
     }

    devices =
        (cl_device_id*)malloc(
            numDevices*sizeof(cl_device_id));


    status = clGetDeviceIDs(
        platforms[0],
        CL_DEVICE_TYPE_ALL,
        numDevices,
        devices,
        NULL);

    if (status != CL_SUCCESS) {
        /* printf( "Error getting device id.\n" );
        exit(status); */
     }

    cl_context context = NULL;

    context = clCreateContext(
        NULL,
        numDevices,
        devices,
        NULL,
        NULL,
        &status);

    if (status != CL_SUCCESS) {
        /* printf( "Error creating context.\n" );
        exit(status);  */
     }

    cl_command_queue cmdQueue;

    cmdQueue = clCreateCommandQueue(
        context,
        devices[0],
        0,
        &status);

    if (status != CL_SUCCESS) {
        /*printf( "Error creating command queue.\n" );
        exit(status); */
     }

    //-----------------------------------------------------
    // STEP 5: Create device buffers
    //----------------------------------------------------- 

    cl_mem buffera;  // Input array on the device
    cl_mem bufferc;  // Input array on the device
    cl_mem bufferresult;  // Output array on the device


    buffera = clCreateBuffer(
        context,
        CL_MEM_READ_ONLY,
        sizea,
        NULL,
        &status);

    if (status != CL_SUCCESS) {
        /* printf( "Error creating buffera.\n" );
        exit(status); */
     }

    bufferc = clCreateBuffer(
        context,
        CL_MEM_READ_ONLY,
        sizec,
        NULL,
        &status);

    if (status != CL_SUCCESS) {
        /* printf( "Error creating bufferc.\n" );
        exit(status); */
     }

    bufferresult = clCreateBuffer(
        context,
        CL_MEM_READ_WRITE,
        sizer,
        NULL,
        &status);

    if (status != CL_SUCCESS) {
        /* printf( "Error creating bufferresult.\n" );
        exit(status); */
     }




    status = clEnqueueWriteBuffer(
        cmdQueue,
        buffera,
        CL_TRUE,
        0,
        sizea,
        a,
        0,
        NULL,
        NULL);

    if (status != CL_SUCCESS) {
        /* printf( "Error writing buffera.\n" );
        exit(status); */
     }

    status = clEnqueueWriteBuffer(
        cmdQueue,
        bufferc,
        CL_TRUE,
        0,
        sizec,
        Bphi,
        0,
        NULL,
        NULL);

    if (status != CL_SUCCESS) {
        /* printf( "Error writing bufferb.\n" );
        exit(status); */
     }

    status = clEnqueueWriteBuffer(
        cmdQueue,
        bufferresult,
        CL_TRUE,
        0,
        sizer,
        results,
        0,
        NULL,
        NULL);

    if (status != CL_SUCCESS) {
        /* printf( "Error writing buffer result %d.\n", status );
        exit(status); */
     }


    cl_program program = clCreateProgramWithSource(
        context,
        1,
        (const char**)&programSource1,
        NULL,
        &status);
    if (status != CL_SUCCESS) {
        /*printf( "Error creating program with source %d.\n", status );
        //printf(" This is the new version.\n") ; */
        
        //exit(status);
     }



    status = clBuildProgram(
        program,
        numDevices,
        devices,
        NULL,
        NULL,
        NULL);

    if (status != CL_SUCCESS) {
      /*  printf( "Error building program %d.\n", status );
	char buffer[10240]; */
//	clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
  //             sizeof(buffer), buffer, NULL);
//	printf("CL Compilation failed:\n%s", buffer);
  //      exit(status);
     }



    cl_kernel kernel = NULL;
    kernel = clCreateKernel(program, "kronVectMult1colOnDevice", &status);

    if (status != CL_SUCCESS) {
     }


    status  = clSetKernelArg(
        kernel,
        0,
        sizeof(cl_mem),
        &buffera);
  /*  status  |= clSetKernelArg(
        kernel,
        2,
        sizeof(cl_mem),
        &bufferc);  */
    status  |= clSetKernelArg(
        kernel,
        2,
        sizeof(cl_mem),
        &bufferresult);
    status |= clSetKernelArg(
        kernel,
        3,
        sizeof(int),
       &na);

    if (status != CL_SUCCESS) {
     }

  size_t globalWorkSize[1];
  globalWorkSize[0] = na ;

// get R's RNG seed 
GetRNGstate();

for(i = 0; i < nc; i++)  // for each row in output
{
  for( j=0; j < na; j++ )   // for each data element
  {
     neweigendenom = tausqy[i] ;
     for( k = 0; k < Fm1; k++) 
        neweigendenom += D[j * Fm1 +k] * tausqphi[i * Fm1 + k ] ;
     normmean = tausqy[i] * By[j] / neweigendenom ;
     normstd = 1.0 / sqrt(neweigendenom) ;
     Bphi[j] = rnorm( normmean, normstd ) ;
  }
  
    status = clEnqueueWriteBuffer(
        cmdQueue,
        bufferc,
        CL_TRUE,
        0,
        sizec,
        Bphi,
        0,
        NULL,
        NULL);  

    if (status != CL_SUCCESS) {
     }

  iter = i + 1 ;

    status  = clSetKernelArg(
        kernel,
        1,
        sizeof(cl_mem),
        &bufferc);  
    status |= clSetKernelArg(
        kernel,
        4,
        sizeof(int),
       &iter);

    if (status != CL_SUCCESS) {
     }

   // new 07/26/13
   clEnqueueBarrier(cmdQueue);

  // do calculation on device:

    status = clEnqueueNDRangeKernel(
        cmdQueue,
        kernel,
        1,
        NULL,
        globalWorkSize,
        NULL,
        0,
        NULL,
        NULL);

    if (status != CL_SUCCESS) {
     }

   // new 07/26/13
   clEnqueueBarrier(cmdQueue);

}

// done gen rand numbers; send seed state back to R
PutRNGstate(); 

  // Retrieve result from device 


    clEnqueueReadBuffer(
        cmdQueue,
        bufferresult,
        CL_TRUE,
        0,
        sizer,
        results,
        0,
        NULL,
        NULL);

    if (status != CL_SUCCESS) {
     }


  // clean up

    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(cmdQueue);
    clReleaseMemObject(buffera);
    clReleaseMemObject(bufferc);
    clReleaseMemObject(bufferresult);
    clReleaseContext(context);

    free(Bphi);
    free(platforms);
    free(devices);


}
示例#7
0
void cl_launch_kernel()
{
	double t_start, t_end;

	int nx = NX;
	int ny = NY;

	size_t localWorkSize[2], globalWorkSize[2];
	localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
	localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
	globalWorkSize[0] = (size_t)ceil(((float)NY) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
	globalWorkSize[1] = (size_t)ceil(((float)NX) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;

	t_start = rtclock();
	int t;
	for(t=0;t<TMAX;t++)
	{
		// Set the arguments of the kernel
		errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&fict_mem_obj);
		errcode =  clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&ex_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&ey_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 3, sizeof(cl_mem), (void *)&hz_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&t);
		errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&nx);
		errcode |= clSetKernelArg(clKernel1, 6, sizeof(int), (void *)&ny);
		
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
		clEnqueueBarrier(clCommandQue);

		// Set the arguments of the kernel
		errcode =  clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&ex_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&ey_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&hz_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), (void *)&nx);
		errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), (void *)&ny);
		
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
		clEnqueueBarrier(clCommandQue);

		// Set the arguments of the kernel
		errcode =  clSetKernelArg(clKernel3, 0, sizeof(cl_mem), (void *)&ex_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 1, sizeof(cl_mem), (void *)&ey_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 2, sizeof(cl_mem), (void *)&hz_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 3, sizeof(int), (void *)&nx);
		errcode |= clSetKernelArg(clKernel3, 4, sizeof(int), (void *)&ny);
		
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel3, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
		clFinish(clCommandQue);
	}


	

	t_end = rtclock();
	fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
示例#8
0
void StepWorldV4DoubleBuffered(world_t &world, float dt, unsigned n)
{
    distCL::distributedCL distributedCL(NULL, NULL);
    int devType = CL_DEVICE_TYPE_GPU;

    cl_int err;
    // Get platforms
    cl_platform_id cpPlatform; // OpenCL platform
    cl_device_id device_id;    // compute device id
    cl_context context;        // compute context
    cl_command_queue commands; // compute command queue
    cl_program program;        // compute program
    cl_kernel kernel;          // compute kernel

    err = distributedCL.GetPlatformIDs(1, &cpPlatform, NULL, {});
    err = distributedCL.GetDeviceIDs(cpPlatform, devType, 1, &device_id, NULL, {});
    err = distributedCL.CreateContext(&context, 0, 1, &device_id, NULL, NULL, {});
    err = distributedCL.CreateCommandQueue(&commands, context, device_id, 0, {});

    // Load kernel to string
    std::string kernelSource = LoadSource("step_world_v3_kernel.cl");
    const char *src = kernelSource.c_str();

    err = distributedCL.CreateProgramWithSource(&program, context, 1,
                                                &src,
                                                NULL, {});

    err = distributedCL.BuildProgram(program, 0, NULL, NULL, NULL, NULL, {});

    cl_mem buffProperties, buffState, buffBuffer;

    sync_data(&world.w, 0);
    sync_data(&world.h, 0);
    sync_data(&world.alpha, 0);

    fprintf(stderr, "%d %d %f\n", world.w, world.h, world.alpha);

    size_t world_size = world.w * world.h;

    data_barrier<uint32_t> properties = distributedCL.CreateBarrier<uint32_t>(world_size, world.w, context, { 0, 1 });
    if (distributedCL.world_rank == 0)
    {
        std::copy(&world.properties[0], &world.properties[0] + world_size, properties.data);
    }

    

    data_barrier<float> state = distributedCL.CreateBarrier<float>(world_size, world.w, context, { 0, 1 });
    if (distributedCL.world_rank == 0)
    {
        std::copy(&world.state[0], &world.state[0] + world_size, state.data);
    }

    err = distributedCL.CreateBuffer(&buffProperties, context, CL_MEM_READ_ONLY,
                                     world_size * sizeof(uint32_t), NULL, {});
    err = distributedCL.CreateBuffer(&buffState, context, CL_MEM_READ_WRITE,
                                     world_size * sizeof(float), NULL, {});
    err = distributedCL.CreateBuffer(&buffBuffer, context, CL_MEM_READ_WRITE,
                                     world_size * sizeof(float), NULL, {});

    if (!buffProperties || !buffState || !buffBuffer)
    {
        std::cerr << "Error: Failed to allocate device memory!" << std::endl;
        exit(1);
    }

    err = distributedCL.CreateKernel(&kernel, program, "kernel_xy", {});

    float outer = world.alpha * dt; // We spread alpha to other cells per time
    float inner = 1 - outer / 4;    // Anything that doesn't spread stays

    err = distributedCL.SetKernelArg(kernel, 0, sizeof(float),
                                     &inner, {});

    err |= distributedCL.SetKernelArg(kernel, 1, sizeof(float),
                                      &outer, {});
    err |= distributedCL.SetKernelArg(kernel, 3, sizeof(cl_mem),
                                      &buffProperties, {});

    if (err != CL_SUCCESS)
    {
        std::cerr << "Error: Failed to set kernel arguments! " << err << std::endl;
        exit(1);
    }

    err = distributedCL.EnqueueWriteBuffer(commands, buffProperties, CL_TRUE,
                                           0, world_size,
                                           &properties, 0,
                                           0, NULL,
                                           NULL, NULL,
                                           0, { 0, 1 });

    err = distributedCL.EnqueueWriteBuffer(commands, buffState, CL_TRUE,
                                           0, world_size,
                                           &state, 0,
                                           0, NULL,
                                           NULL, NULL,
                                           0, { 0, 1 });


    size_t *offset = new size_t[2];
    offset[0] = 0;
    offset[1] = 0;

    size_t *global = new size_t[2];
    global[0] = world.w;
    global[1] = world.h;

    size_t row = world.w;

    for (unsigned t = 0; t < n; ++t)
    {
        err = distributedCL.SetKernelArg(kernel, 2, sizeof(cl_mem),
                                         &buffState, {});
        err = distributedCL.SetKernelArg(kernel, 4, sizeof(cl_mem),
                                         &buffBuffer, {});

        err = distributedCL.EnqueueNDRangeKernel(commands, kernel,
                                                 2,
                                                 offset,
                                                 global,
                                                 NULL,
                                                 0,
                                                 NULL,
                                                 NULL, {});


        //  err = distributedCL.EnqueueReadBuffer(commands, buffState, CL_TRUE,
        //                                       row * 4, row,
        //                                       &state, row * 4,
        //                                       0, NULL,
        //                                       NULL, NULL,
        //                                       0, { 1 });

        // // if(distributedCL.world_rank == 1)
        // {
        //   fprintf(stderr, "%u   ", t);
        //   for (int i = 0; i <row; ++i)
        //   {
        //     fprintf(stderr, "%f ", state.data[row*4 + i]);
        //   }
        //   fprintf(stderr, "\n");
        // }
        clEnqueueBarrier(commands);
        
        std::swap(buffState, buffBuffer);
        world.t += dt;
    }

    delete[] global;
    delete[] offset;

    // err = clEnqueueReadBuffer(commands, buffState, CL_TRUE, 0, world_size, &world.state[0], 0, NULL, NULL);
    err = distributedCL.EnqueueReadBuffer(commands, buffState, CL_TRUE, 0,
                                          world_size, &state, 0, 0, NULL,
                                          NULL, NULL,
                                          0, { 0 });



    if (distributedCL.world_rank == 0)
    {
        std::copy(state.data, state.data + world_size, &world.state[0]);
    }
        clReleaseMemObject(buffProperties);
        clReleaseMemObject(buffState);
        clReleaseMemObject(buffBuffer);
        clReleaseProgram(program);
        clReleaseKernel(kernel);
        clReleaseCommandQueue(commands);
        clReleaseContext(context);
  

    distributedCL.Finalize();
}
void MFNHashTypePlainOpenCL::synchronizeThreads() {
    trace_printf("MFNHashTypePlainOpenCL::synchronizeThreads()\n");
    clEnqueueBarrier(this->OpenCL->getCommandQueue());
}
示例#10
0
void Extrae_OpenCL_clCreateCommandQueue (cl_command_queue queue,
	cl_device_id device, cl_command_queue_properties properties)
{
	if (!Extrae_OpenCL_lookForOpenCLQueue (queue, NULL))
	{
		cl_int err;
		char _threadname[THREAD_INFO_NAME_LEN];
		char _hostname[HOST_NAME_MAX];
		char *_device_type;
		int prev_threadid, found, idx;
		cl_device_type device_type;
		cl_event event;

		idx = nCommandQueues;
		CommandQueues = (RegisteredCommandQueue_t*) realloc (
			CommandQueues,
			sizeof(RegisteredCommandQueue_t)*(nCommandQueues+1));
		if (CommandQueues == NULL)
		{
			fprintf (stderr, PACKAGE_NAME": Fatal error! Failed to allocate memory for OpenCL Command Queues\n");
			exit (-1);
		}

		CommandQueues[idx].queue = queue;
		CommandQueues[idx].isOutOfOrder =
			(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;

		err = clGetDeviceInfo (device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
		if (err == CL_SUCCESS)
		{
			if (device_type  == CL_DEVICE_TYPE_GPU)
				_device_type = "GPU";
			else if (device_type == CL_DEVICE_TYPE_CPU)
				_device_type = "CPU";
			else
				_device_type = "Other";
		}
		else
			_device_type = "Unknown";

		/* Was the thread created before (i.e. did we executed a cudadevicereset?) */
		if (gethostname(_hostname, HOST_NAME_MAX) == 0)
			sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx,
			  _hostname);
		else
			sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx,
			  "unknown-host");

		prev_threadid = Extrae_search_thread_name (_threadname, &found);

		if (found)
		{
			/* If thread name existed, reuse its thread id */
			CommandQueues[idx].threadid = prev_threadid;
		}
		else
		{
			/* For timing purposes we change num of threads here instead of doing Backend_getNumberOfThreads() + CUDAdevices*/
			Backend_ChangeNumberOfThreads (Backend_getNumberOfThreads() + 1);
			CommandQueues[idx].threadid = Backend_getNumberOfThreads()-1;

			/* Set thread name */
			Extrae_set_thread_name (CommandQueues[idx].threadid, _threadname);
		}

		CommandQueues[idx].nevents = 0;

#ifdef CL_VERSION_1_2
		err = clEnqueueBarrierWithWaitList (queue, 0, NULL, &event);
#else
		err = clEnqueueBarrier (queue);
		if (err == CL_SUCCESS)
			err = clEnqueueMarker (queue, &event);
#endif
		CommandQueues[idx].host_reference_time = TIME;

		if (err == CL_SUCCESS)
		{
			err = clFinish(queue);
			if (err != CL_SUCCESS)
			{
				fprintf (stderr, PACKAGE_NAME": Error in clFinish (error = %d)! Dying...\n", err);
				exit (-1);
			}

			err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_SUBMIT,
				sizeof(cl_ulong), &(CommandQueues[idx].device_reference_time),
				NULL);
			if (err != CL_SUCCESS)
			{
				fprintf (stderr, PACKAGE_NAME": Error in clGetEventProfilingInfo (error = %d)! Dying...\n", err);
				exit (-1);
			}
		}
		else
		{
			fprintf (stderr, PACKAGE_NAME": Error while looking for clock references in host & accelerator\n");
			exit (-1);
		}

		nCommandQueues++;
	}
}
void WriteBufferOperation::executeOpenCLRegion(OpenCLDevice *device, rcti *rect, unsigned int chunkNumber,
                                               MemoryBuffer **inputMemoryBuffers, MemoryBuffer *outputBuffer)
{
	float *outputFloatBuffer = outputBuffer->getBuffer();
	cl_int error;
	/*
	 * 1. create cl_mem from outputbuffer
	 * 2. call NodeOperation (input) executeOpenCLChunk(.....)
	 * 3. schedule readback from opencl to main device (outputbuffer)
	 * 4. schedule native callback
	 *
	 * note: list of cl_mem will be filled by 2, and needs to be cleaned up by 4
	 */
	// STEP 1
	const unsigned int outputBufferWidth = outputBuffer->getWidth();
	const unsigned int outputBufferHeight = outputBuffer->getHeight();

	const cl_image_format imageFormat = {
		CL_RGBA,
		CL_FLOAT
	};

	cl_mem clOutputBuffer = clCreateImage2D(device->getContext(), CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &imageFormat, outputBufferWidth, outputBufferHeight, 0, outputFloatBuffer, &error);
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
	
	// STEP 2
	list<cl_mem> *clMemToCleanUp = new list<cl_mem>();
	clMemToCleanUp->push_back(clOutputBuffer);
	list<cl_kernel> *clKernelsToCleanUp = new list<cl_kernel>();

	this->m_input->executeOpenCL(device, outputBuffer, clOutputBuffer, inputMemoryBuffers, clMemToCleanUp, clKernelsToCleanUp);

	// STEP 3

	size_t origin[3] = {0, 0, 0};
	size_t region[3] = {outputBufferWidth, outputBufferHeight, 1};

//	clFlush(queue);
//	clFinish(queue);

	error = clEnqueueBarrier(device->getQueue());
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
	error = clEnqueueReadImage(device->getQueue(), clOutputBuffer, CL_TRUE, origin, region, 0, 0, outputFloatBuffer, 0, NULL, NULL);
	if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
	
	this->getMemoryProxy()->getBuffer()->copyContentFrom(outputBuffer);

	// STEP 4
	while (!clMemToCleanUp->empty()) {
		cl_mem mem = clMemToCleanUp->front();
		error = clReleaseMemObject(mem);
		if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
		clMemToCleanUp->pop_front();
	}

	while (!clKernelsToCleanUp->empty()) {
		cl_kernel kernel = clKernelsToCleanUp->front();
		error = clReleaseKernel(kernel);
		if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error));  }
		clKernelsToCleanUp->pop_front();
	}
	delete clKernelsToCleanUp;
}
void cl_launch_kernel()
{
	double t_start, t_end;

	int m = M;
	int n = N;

	DATA_TYPE float_n = FLOAT_N;
	DATA_TYPE eps = EPS;

	DATA_TYPE val = 1.0;

	size_t localWorkSize_Kernel1[2], globalWorkSize_Kernel1[2];
	size_t localWorkSize_Kernel2[2], globalWorkSize_Kernel2[2];
	size_t localWorkSize_Kernel3[2], globalWorkSize_Kernel3[2];
	size_t localWorkSize_Kernel4[2], globalWorkSize_Kernel4[2];

	localWorkSize_Kernel1[0] = LWS_KERNEL_1_X;
	localWorkSize_Kernel1[1] = LWS_KERNEL_1_Y;
	globalWorkSize_Kernel1[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_1_X)) * LWS_KERNEL_1_X;
	globalWorkSize_Kernel1[1] = 1;

	localWorkSize_Kernel2[0] = LWS_KERNEL_2_X;
	localWorkSize_Kernel2[1] = LWS_KERNEL_2_Y;
	globalWorkSize_Kernel2[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_2_X)) * LWS_KERNEL_2_X;
	globalWorkSize_Kernel2[1] = 1;

	localWorkSize_Kernel3[0] = LWS_KERNEL_3_X;
	localWorkSize_Kernel3[1] = LWS_KERNEL_3_Y;
	globalWorkSize_Kernel3[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_3_X)) * LWS_KERNEL_3_X;
	globalWorkSize_Kernel3[1] = (size_t)ceil(((float)N) / ((float)LWS_KERNEL_3_Y)) * LWS_KERNEL_3_Y;

	localWorkSize_Kernel4[0] = LWS_KERNEL_4_X;
	localWorkSize_Kernel4[1] = LWS_KERNEL_4_Y;
	globalWorkSize_Kernel4[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_4_X)) * LWS_KERNEL_4_X;
	globalWorkSize_Kernel4[1] = 1;


//	t_start = rtclock();	
	
	// Set the arguments of the kernel
	err_code =  clSetKernelArg(clKernel_mean, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
	err_code |= clSetKernelArg(clKernel_mean, 1, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_mean, 2, sizeof(DATA_TYPE), (void *)&float_n);
	err_code |= clSetKernelArg(clKernel_mean, 3, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_mean, 4, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in seting arguments1\n");
          exit(1);
        }

	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_mean, 1, NULL, globalWorkSize_Kernel1, localWorkSize_Kernel1, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel1\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	// Set the arguments of the kernel
	err_code =  clSetKernelArg(clKernel_std, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
	err_code =  clSetKernelArg(clKernel_std, 1, sizeof(cl_mem), (void *)&stddev_mem_obj);
	err_code |= clSetKernelArg(clKernel_std, 2, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_std, 3, sizeof(DATA_TYPE), (void *)&float_n);
	err_code |= clSetKernelArg(clKernel_std, 4, sizeof(DATA_TYPE), (void *)&eps);
	err_code |= clSetKernelArg(clKernel_std, 5, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_std, 6, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in seting arguments2\n");
          exit(1);
        }
 
	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_std, 1, NULL, globalWorkSize_Kernel2, localWorkSize_Kernel2, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel2\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	// Set the arguments of the kernel
	err_code =  clSetKernelArg(clKernel_reduce, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
	err_code =  clSetKernelArg(clKernel_reduce, 1, sizeof(cl_mem), (void *)&stddev_mem_obj);
	err_code |= clSetKernelArg(clKernel_reduce, 2, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_reduce, 3, sizeof(DATA_TYPE), (void *)&float_n);
	err_code |= clSetKernelArg(clKernel_reduce, 4, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_reduce, 5, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS) 
        {
          printf("Error in seting arguments3\n");
          exit(1);
        }
 
	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_reduce, 2, NULL, globalWorkSize_Kernel3, localWorkSize_Kernel3, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel3\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	// Set the arguments of the kernel	
	err_code =  clSetKernelArg(clKernel_corr, 0, sizeof(cl_mem), (void *)&symmat_mem_obj);
	err_code |= clSetKernelArg(clKernel_corr, 1, sizeof(cl_mem), (void *)&data_mem_obj);
	err_code |= clSetKernelArg(clKernel_corr, 2, sizeof(int), (void *)&m);
	err_code |= clSetKernelArg(clKernel_corr, 3, sizeof(int), (void *)&n);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in seting arguments4\n");
          exit(1);
        }

	// Execute the OpenCL kernel
	err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_corr, 1, NULL, globalWorkSize_Kernel4, localWorkSize_Kernel4, 0, NULL, NULL);
	if(err_code != CL_SUCCESS)
        {
          printf("Error in launching kernel4\n");
          exit(1);
        }

	clEnqueueBarrier(clCommandQue);

	clEnqueueWriteBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, ((M)*(M+1) + (M))*sizeof(DATA_TYPE), sizeof(DATA_TYPE), &val, 0, NULL, NULL);

	clFinish(clCommandQue);

//	t_end = rtclock();
//	fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
static cl_int runSummarization(CLInfo* ci,
                               SeparationCLMem* cm,
                               const IntegralArea* ia,
                               cl_uint which,
                               Kahan* resultOut)
{
    cl_int err = CL_SUCCESS;
    cl_mem buf;
    cl_uint offset;
    size_t global[1];
    size_t local[1];
    real result[2] = { -1.0, -1.0 };
    cl_uint nElements = ia->r_steps * ia->mu_steps;
    cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] };

    if (which == 0)
    {
        buf = cm->outBg;
        offset = 0;
    }
    else
    {
        buf = cm->outStreams;
        offset = (which - 1) * nElements;
    }


    /* First call reads from an offset into one of the output buffers */
    err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
    err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf);
    err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel arguments");
        return err;
    }

    local[0] = _summarizationWorkgroupSize;
    global[0] = mwNextMultiple(local[0], nElements);

    err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
                                 NULL, global, local,
                                 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization kernel");
        return err;
    }

    /* Why is this necessary? It seems to frequently break on the 7970 and nowhere else without it */
    err = clFinish(ci->queue);
    //err = clFlush(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error finishing summarization kernel");
        return err;
    }

    /* Later calls swap between summarization buffers without an offset */
    nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    offset = 0;
    err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error setting summarization kernel offset argument");
        return err;
    }

    while (nElements > 1)
    {
        /* Swap old summarization buffer to the input and shrink the range */
        swapBuffers(sumBufs);

        global[0] = mwNextMultiple(local[0], nElements);

        err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
        err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]);
        err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error setting summarization kernel arguments");
            return err;
        }

        /*
        err = clEnqueueBarrier(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization barrier");
            return err;
        }
        */

        err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
                                     NULL, global, local,
                                     0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error enqueuing summarization kernel");
            return err;
        }

        err = clFinish(ci->queue);
        if (err != CL_SUCCESS)
        {
            mwPerrorCL(err, "Error finishing summarization kernel");
            return err;
        }

        nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
    }


    err = clEnqueueBarrier(ci->queue);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error enqueuing summarization barrier");
        return err;
    }

    err = clEnqueueReadBuffer(ci->queue, sumBufs[0], CL_TRUE,
                              0, 2 * sizeof(real), result,
                              0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error reading summarization result buffer");
        return err;
    }

    resultOut->sum = result[0];
    resultOut->correction = result[1];

    return CL_SUCCESS;
}
示例#14
0
/*!
    Adds a barrier to the active command queue.  All commands that
    were queued before this point must finish before any further
    commands added after this point are executed.

    This function will return immediately and will not block waiting
    for the commands to finish.  Use sync() to block until all queued
    commands finish.

    \sa marker(), sync()
*/
void QCLContext::barrier()
{
    cl_int error = clEnqueueBarrier(activeQueue());
    reportError("QCLContext::barrier:", error);
}
示例#15
0
END_TEST

START_TEST (test_misc_events)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_int result;
    cl_event uevent1, uevent2, marker1, marker2;

    result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    fail_if(
        result != CL_SUCCESS,
        "unable to get the default device"
    );

    ctx = clCreateContext(0, 1, &device, 0, 0, &result);
    fail_if(
        result != CL_SUCCESS || ctx == 0,
        "unable to create a valid context"
    );

    queue = clCreateCommandQueue(ctx, device, 0, &result);
    fail_if(
        result != CL_SUCCESS || queue == 0,
        "cannot create a command queue"
    );

    /*
     * This test will build a command queue blocked by an user event. The events
     * will be in this order :
     *
     * -: UserEvent1
     * 0: WaitForEvents1 (wait=UserEvent1)
     * 1: Marker1
     * -: UserEvent2
     * 2: WaitForEvents2 (wait=UserEvent2)
     * 3: Barrier
     * 4: Marker2 (to check the barrier worked)
     *
     * When the command queue is built, we :
     *  - Check that Marker1 is Queued (WaitForEvents waits)
     *  - Set UserEvent1 to Complete
     *  - Check that Marker1 is Complete (WaitForEvents stopped to wait)
     *  - Check that Marker2 is Queued (Barrier is there)
     *  - Set UserEvent2 to Complete
     *  - Check that Marker2 is Complete (no more barrier)
     */
    uevent1 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent1"
    );

    uevent2 = clCreateUserEvent(ctx, &result);
    fail_if(
        result != CL_SUCCESS,
        "unable to create UserEvent2"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent1)"
    );

    result = clEnqueueMarker(queue, &marker1);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker1"
    );

    result = clEnqueueWaitForEvents(queue, 1, &uevent2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue WaitForEvents(UserEvent2)"
    );

    result = clEnqueueBarrier(queue);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Barrier"
    );

    result = clEnqueueMarker(queue, &marker2);
    fail_if(
        result != CL_SUCCESS,
        "unable to enqueue Marker2"
    );

    // Now the checks
    cl_int status;

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker1 must be Queued"
    );

    result = clSetUserEventStatus(uevent1, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent1 to Complete"
    );

    result = clGetEventInfo(marker1, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker1 must be Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_QUEUED,
        "Marker2 must be Queued"
    );

    result = clSetUserEventStatus(uevent2, CL_COMPLETE);
    fail_if(
        result != CL_SUCCESS,
        "unable to set UserEvent2 to Complete"
    );

    result = clGetEventInfo(marker2, CL_EVENT_COMMAND_EXECUTION_STATUS,
                            sizeof(cl_int), &status, 0);
    fail_if(
        result != CL_SUCCESS || status != CL_COMPLETE,
        "Marker2 must be Complete"
    );

    clFinish(queue);

    clReleaseEvent(uevent1);
    clReleaseEvent(uevent2);
    clReleaseEvent(marker1);
    clReleaseEvent(marker2);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
}
示例#16
0
void cl_launch_kernel()
{
	int m = M;
	int n = N;

  size_t oldLocalWorkSize[2], globalWorkSizeKernel1[2], localWorkSize[2]; 
  size_t globalWorkSizeKernel2[2], globalWorkSizeKernel3[2];

  oldLocalWorkSize[0] = DIM_THREAD_BLOCK_X;
	oldLocalWorkSize[1] = DIM_THREAD_BLOCK_Y;
	globalWorkSizeKernel1[0] = DIM_THREAD_BLOCK_X;
	globalWorkSizeKernel1[1] = DIM_THREAD_BLOCK_Y;
	globalWorkSizeKernel2[0] = N;
	globalWorkSizeKernel2[1] = 1;
	globalWorkSizeKernel3[0] = N;
	globalWorkSizeKernel3[1] = 1;

  ///////////////////////////////////////////////
  // Kernel 2.
  getNewSizes(NULL, oldLocalWorkSize, NULL, localWorkSize, "gramschmidt_kernel2", 2);
  // Kernel 3.
  getNewSizes(NULL, localWorkSize, NULL, localWorkSize, "gramschmidt_kernel3", 2);

  ///////////////////////////////////////////////

	int k;
	for (k = 0; k < 1; k++)
	{
		// Set the arguments of the kernel
		errcode =  clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj);
		errcode =  clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&r_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&q_mem_obj);
		errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), (void *)&k);
		errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&m);
		errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&n);
	
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
	
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 1, NULL, globalWorkSizeKernel1, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n");
		clEnqueueBarrier(clCommandQue);


		errcode =  clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&a_mem_obj);
		errcode =  clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&r_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&q_mem_obj);
		errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), (void *)&k);
		errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), (void *)&m);
		errcode |= clSetKernelArg(clKernel2, 5, sizeof(int), (void *)&n);
	
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
	
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 1, NULL, globalWorkSizeKernel2, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel2\n");
		clEnqueueBarrier(clCommandQue);


		errcode =  clSetKernelArg(clKernel3, 0, sizeof(cl_mem), (void *)&a_mem_obj);
		errcode =  clSetKernelArg(clKernel3, 1, sizeof(cl_mem), (void *)&r_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 2, sizeof(cl_mem), (void *)&q_mem_obj);
		errcode |= clSetKernelArg(clKernel3, 3, sizeof(int), (void *)&k);
		errcode |= clSetKernelArg(clKernel3, 4, sizeof(int), (void *)&m);
		errcode |= clSetKernelArg(clKernel3, 5, sizeof(int), (void *)&n);
	
		if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n");
	
		// Execute the OpenCL kernel
		errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel3, 1, NULL, globalWorkSizeKernel3, localWorkSize, 0, NULL, NULL);
		if(errcode != CL_SUCCESS) printf("Error in launching kernel3\n");
		clEnqueueBarrier(clCommandQue);

	}
	clFinish(clCommandQue);

}
示例#17
0
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength,
		int bLength, int keyLength, uint32_t** matches, char* clFile, int x,
		int y) {
	int gap = 0, myoffset = 0;
	cl_platform_id *platforms;
	cl_uint num_platforms = 0;
	cl_device_id *devices;
	cl_uint num_devices = 0;
	cl_context context;
	cl_command_queue command_queue;
	cl_image_format imgFormat;
	cl_mem aImg;
	cl_mem bImg;
	cl_mem res_buf;
	cl_program program;
	cl_kernel kernel;
	cl_uint *results;
	FILE *prgm_fptr;
	struct stat prgm_sbuf;
	char *prgm_data;
	size_t prgm_size;
	size_t offset;
	size_t count;
	const size_t global_work_size[] = { x, y };
	const size_t origin[] = { 0, 0, 0 };
	const size_t region[] = { aLength, 1, 1 };

	cl_int ret;
	cl_uint i;

	cl_bool imageSupport;

	struct timeval t1, t2;
	double elapsedTime;

	results = malloc(sizeof(cl_uint) * aLength);

	imgFormat.image_channel_order = CL_RGBA;
	imgFormat.image_channel_data_type = CL_UNSIGNED_INT32;

	/* figure out how many CL platforms are available */
	ret = clGetPlatformIDs(0, NULL, &num_platforms);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting the number of platform IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (0 == num_platforms) {
		print_error ("No CL platforms were found.");
		exit(EXIT_FAILURE);
	}

	/* allocate space for each available platform ID */
	if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* get all of the platform IDs */
	ret = clGetPlatformIDs(num_platforms, platforms, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting platform IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* find a platform that supports given device type */
	//	print_error ("Number of platforms found: %d", num_platforms);
	for (i = 0; i < num_platforms; i++) {
		ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL,
				&num_devices);
		if (CL_SUCCESS != ret)
			continue;

		if (0 < num_devices)
			break;
	}

	/* make sure at least one device was found */
	if (num_devices == 0) {
		print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU"));
		exit(EXIT_FAILURE);
	}

	/* only one device is necessary... */
	num_devices = 1;
	if (NULL == (devices = malloc((sizeof *devices) * num_devices))) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* get one device id */
	ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices,
			devices, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Error getting device IDs: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL);
	if (CL_SUCCESS != ret) {
			print_error ("Failed to get Device Info: %d", ret);
			exit(EXIT_FAILURE);
		}

	if(imageSupport == CL_FALSE)
	{
		print_error ("Failure: Images are not supported!");
				exit(EXIT_FAILURE);
	}

	/* create a context for the CPU device that was found earlier */
	context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret);
	if (NULL == context || CL_SUCCESS != ret) {
		print_error ("Failed to create context: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* create a command queue for the CPU device */
	command_queue = clCreateCommandQueue(context, devices[0], 0, &ret);
	if (NULL == command_queue || CL_SUCCESS != ret) {
		print_error ("Failed to create a command queue: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* create buffers on the CL device */
	aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret);
	if (NULL == aImg || CL_SUCCESS != ret) {
		print_error ("Failed to create a image: %d", ret);
		exit(EXIT_FAILURE);
	}

	bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret);
	if (NULL == bImg || CL_SUCCESS != ret) {
		print_error ("Failed to create b image: %d", ret);
		exit(EXIT_FAILURE);
	}

	int res_bufSize = aLength;

	res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint)
			* res_bufSize, NULL, &ret);
	if (NULL == res_buf || CL_SUCCESS != ret) {
		print_error ("Failed to create b buffer: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* read the opencl program code into a string */
	prgm_fptr = fopen(clFile, "r");
	if (NULL == prgm_fptr) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}

	if (0 != stat(clFile, &prgm_sbuf)) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}
	prgm_size = prgm_sbuf.st_size;

	prgm_data = malloc(prgm_size);
	if (NULL == prgm_data) {
		print_error ("Out of memory");
		exit(EXIT_FAILURE);
	}

	/* make sure all data is read from the file (just in case fread returns
	 * short) */
	offset = 0;
	while (prgm_size - offset != (count = fread(prgm_data + offset, 1,
			prgm_size - offset, prgm_fptr)))
		offset += count;

	if (0 != fclose(prgm_fptr)) {
		print_error ("%s", strerror (errno));
		exit(EXIT_FAILURE);
	}

	/* create a 'program' from the source */
	program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data,
			&prgm_size, &ret);
	if (NULL == program || CL_SUCCESS != ret) {
		print_error ("Failed to create program with source: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* compile the program.. (it uses llvm or something) */
	ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL);
	if (CL_SUCCESS != ret) {
		size_t size;
		char *log = calloc(1, 4000);
		if (NULL == log) {
			print_error ("Out of memory");
			exit(EXIT_FAILURE);
		}

		print_error ("Failed to build program: %d", ret);
		ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG,
				4096, log, &size);
		if (CL_SUCCESS != ret) {
			print_error ("Failed to get program build info: %d", ret);
			exit(EXIT_FAILURE);
		}

		fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log);
		exit(EXIT_FAILURE);
	}

	/* pull out a reference to your kernel */
	kernel = clCreateKernel(program, "cgm_kernel", &ret);
	if (NULL == kernel || CL_SUCCESS != ret) {
		print_error ("Failed to create kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	gettimeofday(&t1, NULL);

	/* write data to these buffers */
	clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0,
			(void*) aImg, 0, NULL, NULL);
	clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0,
			(void*) bImg, 0, NULL, NULL);

	/* set your kernel's arguments */
	ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clSetKernelArg(kernel, 4, sizeof(int), &gap);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to set kernel argument: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* make sure buffers have been written before executing */
	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* enque this kernel for execution... */
	ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
			global_work_size, NULL, 0, NULL, NULL);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* wait for the kernel to finish executing */
	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* copy the contents of dev_buf from the CL device to the host (CPU) */
	ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint)
			* aLength, results, 0, NULL, NULL);

	gettimeofday(&t2, NULL);
	elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms
	elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms

	if (CL_SUCCESS != ret) {
		print_error ("Failed to copy data from device to host: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clEnqueueBarrier(command_queue);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to enqueue barrier: %d", ret);
		exit(EXIT_FAILURE);
	}

	/* make sure the content of the buffer are what we expect */
	//for (i = 0; i < aLength; i++)
	//	printf("%d\n", results[i]);

	/* free up resources */
	ret = clReleaseKernel(kernel);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release kernel: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseProgram(program);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release program: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseMemObject(aImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}
	ret = clReleaseMemObject(bImg);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}

	ret = clReleaseMemObject(res_buf);
	if (CL_SUCCESS != ret) {
		print_error ("Failed to release memory object: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) {
		print_error ("Failed to release command queue: %d", ret);
		exit(EXIT_FAILURE);
	}

	if (CL_SUCCESS != (ret = clReleaseContext(context))) {
		print_error ("Failed to release context: %d", ret);
		exit(EXIT_FAILURE);
	}

	matches = &results;
	return elapsedTime;
}