Example #1
	virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
	{
		cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
		/* Use small global size on CPU devices as it seems to be much faster. */
		if(type == CL_DEVICE_TYPE_CPU) {
			VLOG(1) << "Global size: (64, 64).";
			return make_int2(64, 64);
		}

		cl_ulong max_buffer_size;
		clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);

		if(DebugFlags().opencl.mem_limit) {
			max_buffer_size = min(max_buffer_size,
			                      cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
		}

		VLOG(1) << "Maximum device allocation size: "
		        << string_human_readable_number(max_buffer_size) << " bytes. ("
		        << string_human_readable_size(max_buffer_size) << ").";

		/* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
		max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);

		size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
		int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
		VLOG(1) << "Global size: " << global_size << ".";
		return global_size;
	}
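For reference, the square-ish global size computed above can be reproduced in isolation. This is a minimal, self-contained sketch with a hypothetical element count and a local stand-in for Cycles' round_down() helper; it is not the code Cycles itself runs:

#include <algorithm>
#include <cmath>
#include <cstdio>

/* Local stand-in for Cycles' round_down(): largest multiple of m that is <= x. */
static int round_down(int x, int m) { return (x / m) * m; }

int main()
{
    size_t num_elements = 3000000; /* hypothetical result of
                                      max_elements_for_max_buffer_size() */
    int height = (int)sqrt((double)num_elements);
    int width  = std::max(round_down(height, 64), 64); /* width rounded down to a multiple of 64 */
    printf("Global size: (%d, %d)\n", width, height);
    return 0;
}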
Example #2
//------------------------------------------------------------------------------
double timeEnqueueNDRangeKernel(cl_command_queue command_queue,
                                cl_kernel kernel,
                                cl_uint work_dim,
                                const size_t *global_work_offset,
                                const size_t *global_work_size,
                                const size_t *local_work_size,
                                cl_uint num_events_in_wait_list,
                                const cl_event *event_wait_list) {
    cl_int status = clFinish(command_queue);
    check_cl_error(status, "clFinish");
    cl_event profilingEvent;
    status = clEnqueueNDRangeKernel(command_queue,
                                    kernel,              
                                    work_dim,
                                    global_work_offset,
                                    global_work_size,
                                    local_work_size,
                                    num_events_in_wait_list,
                                    event_wait_list,
                                    &profilingEvent);
    check_cl_error(status, "clEnqueueNDRangeKernel");
    status = clFinish(command_queue); //ensure kernel execution has
    //completed; needed for timing purposes only, since a subsequent blocking
    //data transfer would implicitly wait for the kernel to finish anyway
    check_cl_error(status, "clFinish");
    status = clWaitForEvents(1, &profilingEvent);
    check_cl_error(status, "clWaitForEvents");
    cl_ulong kernelStartTime = cl_ulong(0);
    cl_ulong kernelEndTime   = cl_ulong(0);
    size_t retBytes = size_t(0);
    double kernelElapsedTime_ms = double(0); 
   
    //note: CL_PROFILING_COMMAND_QUEUED measures from the moment the command
    //was enqueued on the host; CL_PROFILING_COMMAND_START would exclude
    //queueing and submission overhead
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_QUEUED,
                                     sizeof(cl_ulong),
                                     &kernelStartTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_END,
                                     sizeof(cl_ulong),
                                     &kernelEndTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    //event timing is reported in nanoseconds: divide by 1e6 to get
    //time in milliseconds
    kernelElapsedTime_ms =  (double)(kernelEndTime - kernelStartTime) / 1E6;
    return kernelElapsedTime_ms;
}
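A possible call site for timeEnqueueNDRangeKernel(), assuming a command queue created with CL_QUEUE_PROFILING_ENABLE, a kernel whose arguments are already set, and the usual OpenCL and <iostream> headers; queue, kernel and n below are placeholders, not names taken from the example:

//hypothetical usage sketch: time a 1-D launch over n work-items on a
//profiling-enabled queue
double time_single_launch(cl_command_queue queue, cl_kernel kernel, size_t n) {
    double ms = timeEnqueueNDRangeKernel(queue,    //profiling-enabled queue
                                         kernel,   //kernel with arguments set
                                         1,        //one work dimension
                                         nullptr,  //no global offset
                                         &n,       //total work-items
                                         nullptr,  //runtime picks the local size
                                         0, nullptr); //empty wait list
    std::cout << "kernel time (ms): " << ms << std::endl;
    return ms;
}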
Example #3
//------------------------------------------------------------------------------
double get_cl_time(cl_event ev) {
    cl_ulong startTime = cl_ulong(0);
    cl_ulong endTime   = cl_ulong(0);
    size_t retBytes = size_t(0);
    cl_int status = clGetEventProfilingInfo(ev,
                   CL_PROFILING_COMMAND_QUEUED,
                   sizeof(cl_ulong),
                   &startTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    status = clGetEventProfilingInfo(ev,
                   CL_PROFILING_COMMAND_END,
                   sizeof(cl_ulong),
                   &endTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    //event timing is reported in nanoseconds: divide by 1e6 to get
    //time in milliseconds
    return double((endTime - startTime) / 1E6);    
}
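get_cl_time() needs an event recorded on a profiling-enabled queue, and the event must have completed before the query. A hedged usage sketch, where queue, kernel and globalSize are placeholders rather than names from the example:

//hypothetical usage sketch: profile one launch and print its duration
void print_launch_time(cl_command_queue queue, cl_kernel kernel, size_t globalSize) {
    cl_event ev;
    cl_int status = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr,
                                           &globalSize, nullptr, 0, nullptr, &ev);
    check_cl_error(status, "clEnqueueNDRangeKernel");
    check_cl_error(clWaitForEvents(1, &ev), "clWaitForEvents");
    std::cout << "elapsed (ms): " << get_cl_time(ev) << std::endl;
    check_cl_error(clReleaseEvent(ev), "clReleaseEvent");
}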
Example #4
void DebugParticleSystem::update_impl(float time_step)
{
    if (m_cl_timer.isReady()) m_cl_timer.start();

    size_t part_cnt = m_sim_params->particleCount();
    float sim_time = m_sim_params->time();

    if (m_spiral)
        m_polar_spiral_kernel.set_arg(3, cl_ulong(time(nullptr) + sim_time));
    else
        m_gen_rand_particles_kernel.set_arg(2, cl_ulong(time(nullptr) + sim_time));

    cl_mem buffers[] = { m_particle_pos_buf.getCLID(), m_particle_col_buf.getCLID() };

    utils::ocl::GLSyncHandler sync(m_queue, sizeof(buffers) / sizeof(buffers[0]), buffers);
    if (!sync) return;

    cl_int err = CL_SUCCESS;

    if (m_spiral)
    {
        err = clEnqueueNDRangeKernel(m_queue, m_polar_spiral_kernel, 1,
                                     nullptr, &part_cnt, nullptr,
                                     0, nullptr, m_stats.event("polar_spiral"));
    }
    else
    {
        err = clEnqueueNDRangeKernel(m_queue, m_gen_rand_particles_kernel, 1,
                                     nullptr, &part_cnt, nullptr,
                                     0, nullptr, m_stats.event("gen_part_positions"));
    }

    if (err != CL_SUCCESS)
    {
        WARNM("DebugParticleSystem: Failed to enqueue test simulation kernel: "
              << utils::ocl::errorToStr(err));
    }

    m_sim_params->advanceTime(time_step);

    if (m_cl_timer.isReady()) m_cl_timer.stop();
}
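utils::ocl::GLSyncHandler is project-specific, so for context only: with the plain OpenCL/OpenGL interop API, the acquire/launch/release pattern such a helper typically wraps looks roughly like the sketch below. The queue, kernel, particle count and the two GL-shared cl_mem handles are assumed to exist; this is not the project's actual implementation:

//hedged sketch of the usual acquire/launch/release sequence for GL-shared buffers
void run_on_gl_buffers(cl_command_queue queue, cl_kernel kernel, size_t part_cnt,
                       cl_mem pos_buf, cl_mem col_buf) {
    cl_mem buffers[] = { pos_buf, col_buf };
    cl_int err = clEnqueueAcquireGLObjects(queue, 2, buffers, 0, nullptr, nullptr);
    if (err != CL_SUCCESS) return;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, nullptr,
                                 &part_cnt, nullptr, 0, nullptr, nullptr);
    //hand the buffers back to OpenGL regardless of the launch result
    clEnqueueReleaseGLObjects(queue, 2, buffers, 0, nullptr, nullptr);
    clFinish(queue);
}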
Example #5
 cl_ulong eventProfilingInfo(cl_event id, cl_profiling_info info)
 {
     cl_ulong value;
     cl_int error = CL_SUCCESS;
     if(!id || (error = clGetEventProfilingInfo(id, info, 
             sizeof(cl_ulong), &value, nullptr)) != CL_SUCCESS)
     {
         reportError("eventInfo(): ", error);
         return cl_ulong(0);
     }
     return value;
 }
//------------------------------------------------------------------------------
int main(int argc, char** argv) {
    if(argc < 8) {
        std::cerr << "usage: " << argv[0]
                  << " <platform name> <device type = default | cpu | gpu "
                     "| acc | all>  <device num> <OpenCL source file path>"
                     " <kernel name> <matrix size> <workgroup size>"
                  << std::endl;
        exit(EXIT_FAILURE);   
    }
    const int SIZE = atoi(argv[6]);
    const size_t BYTE_SIZE = SIZE * SIZE * sizeof(real_t);
    const int BLOCK_SIZE = atoi(argv[7]); //tile edge length, e.g. 16 -> 16 x 16 work-item tiles
    if( SIZE < 1 || BLOCK_SIZE < 1 || (SIZE % BLOCK_SIZE) != 0) {
        std::cerr << "ERROR - size and block size *must* be greater than zero "
                     "and size *must* be evenly divisible by block size"
                  << std::endl;
        exit(EXIT_FAILURE);
    }
    //setup text header that will be prefixed to opencl code
    std::ostringstream clheaderStream;
    clheaderStream << "#define BLOCK_SIZE " << BLOCK_SIZE << '\n';
#ifdef USE_DOUBLE    
    clheaderStream << "#define DOUBLE\n";
    const double EPS = 0.000000001;
#else
    const double EPS = 0.00001;
#endif
    //enable profiling on queue    
    CLEnv clenv = create_clenv(argv[1], argv[2], atoi(argv[3]), true,
                               argv[4], argv[5], clheaderStream.str());
   
    cl_int status;
    //create input and output matrices
    std::vector<real_t> A = create_matrix(SIZE, SIZE);
    std::vector<real_t> B = create_matrix(SIZE, SIZE);
    std::vector<real_t> C(SIZE * SIZE,real_t(0));
    std::vector<real_t> refC(SIZE * SIZE,real_t(0));        
    
    //allocate output buffer on OpenCL device
    cl_mem devC = clCreateBuffer(clenv.context,
                                 CL_MEM_WRITE_ONLY,
                                 BYTE_SIZE,
                                 0,
                                 &status);
    check_cl_error(status, "clCreateBuffer");

    //allocate input buffers on OpenCL devices and copy data
    cl_mem devA = clCreateBuffer(clenv.context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 BYTE_SIZE,
                                 &A[0], //<-- copy data from A
                                 &status);
    check_cl_error(status, "clCreateBuffer");                              
    cl_mem devB = clCreateBuffer(clenv.context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 BYTE_SIZE,
                                 &B[0], //<-- copy data from B
                                 &status);
    check_cl_error(status, "clCreateBuffer");                              

    //set kernel parameters
    status = clSetKernelArg(clenv.kernel, //kernel
                            0,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devA); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(A)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            1,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devB); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(B)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            2,      //parameter id
                            sizeof(cl_mem), //size of parameter
                            &devC); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(C)");
    status = clSetKernelArg(clenv.kernel, //kernel
                            3,      //parameter id
                            sizeof(int), //size of parameter
                            &SIZE); //pointer to parameter
    check_cl_error(status, "clSetKernelArg(SIZE)");


    //setup kernel launch configuration
    //total number of threads == number of array elements
    const size_t globalWorkSize[2] = {size_t(SIZE), size_t(SIZE)};
    //number of per-workgroup local threads
    const size_t localWorkSize[2] = {size_t(BLOCK_SIZE), size_t(BLOCK_SIZE)};

    //launch kernel
    //to make sure there are no pending commands in the queue do wait
    //for any commands to finish execution
    status = clFinish(clenv.commandQueue);
    check_cl_error(status, "clFinish");
    cl_event profilingEvent;
    status = clEnqueueNDRangeKernel(clenv.commandQueue, //queue
                                    clenv.kernel, //kernel                                   
                                    2, //number of dimensions for work-items
                                    0, //global work offset
                                    globalWorkSize, //total number of threads
                                    localWorkSize, //threads per workgroup
                                    0, //number of events that need to
                                       //complete before kernel executed
                                    0, //list of events that need to complete
                                       //before kernel executed
                                    &profilingEvent); //event object 
                                                     //identifying this
                                                     //particular kernel
                                                     //execution instance
    check_cl_error(status, "clEnqueueNDRangeKernel");
    status = clFinish(clenv.commandQueue); //ensure kernel execution has
    //completed; needed for timing purposes only, since a subsequent blocking
    //data transfer would implicitly wait for the kernel to finish anyway
    check_cl_error(status, "clFinish");
    status = clWaitForEvents(1, &profilingEvent);
    check_cl_error(status, "clWaitForEvents");
    cl_ulong kernelStartTime = cl_ulong(0);
    cl_ulong kernelEndTime   = cl_ulong(0);
    size_t retBytes = size_t(0);
    double kernelElapsedTime_ms = double(0); 
   
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_QUEUED,
                                     sizeof(cl_ulong),
                                     &kernelStartTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    status = clGetEventProfilingInfo(profilingEvent,
                                     CL_PROFILING_COMMAND_END,
                                     sizeof(cl_ulong),
                                     &kernelEndTime, &retBytes);
    check_cl_error(status, "clGetEventProfilingInfo");
    //event timing is reported in nanoseconds: divide by 1e6 to get
    //time in milliseconds
    kernelElapsedTime_ms =  (double)(kernelEndTime - kernelStartTime) / 1E6;
    //read back and check results
    status = clEnqueueReadBuffer(clenv.commandQueue,
                                 devC,
                                 CL_TRUE, //blocking read
                                 0, //offset
                                 BYTE_SIZE, //byte size of data
                                 &C[0], //destination buffer in host memory
                                 0, //number of events that need to
                                    //complete before transfer executed
                                 0, //list of events that need to complete
                                    //before transfer executed
                                 0); //event identifying this specific operation
    check_cl_error(status, "clEnqueueReadBuffer");
    
    host_matmul(A, B, refC, SIZE, SIZE);

    if(check_result(refC, C, EPS)) {
        std::cout << "PASSED" << std::endl;
        std::cout << "Elapsed time(ms): " << kernelElapsedTime_ms << std::endl;
    } else {
        std::cout << "FAILED" << std::endl;
    }

    check_cl_error(clReleaseMemObject(devA), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(devB), "clReleaseMemObject");
    check_cl_error(clReleaseMemObject(devC), "clReleaseMemObject");
    release_clenv(clenv);
   
    return 0;
}
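host_matmul() and check_result() are not shown in this example. As an assumption about their behaviour (not the author's code), a straightforward row-major reference multiply matching the host_matmul(A, B, refC, SIZE, SIZE) call above could be:

//hypothetical reference implementation for the host_matmul() call above
void host_matmul(const std::vector<real_t>& A,
                 const std::vector<real_t>& B,
                 std::vector<real_t>& C,
                 int rows, int columns) {
    for(int r = 0; r != rows; ++r) {
        for(int c = 0; c != columns; ++c) {
            real_t acc = real_t(0);
            for(int k = 0; k != columns; ++k) {
                acc += A[r * columns + k] * B[k * columns + c];
            }
            C[r * columns + c] = acc;
        }
    }
}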