/** * \brief ocl::Image::write Transfers data from host memory to this Image. * * You can be sure that the data is write. * \param ptr_to_host_data must point to a memory location whith region bytes available. * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}. */ void ocl::Image::write(const void *ptr_to_host_data, const size_t *region, const EventList &list) const { TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0"); std::vector<size_t> origin = {0, 0, 0}; OPENCL_SAFE_CALL( clEnqueueWriteImage(this->activeQueue().id(), this->id(), CL_TRUE, origin.data(), region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), NULL) ); OPENCL_SAFE_CALL( clFinish(this->activeQueue().id()) ); }
/** * \brief ocl::Image::writeAsync Transfers data from host memory to this Image. * * Waits until the event list is completed. * \param origin is the 3D offset in bytes from which the Image is read. * \param ptr_to_host_data must point to a memory location whith region bytes available. * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}. * \param list contains all events for which this command has to wait. * \return an event which can be further put into an event list for synchronization. */ ocl::Event ocl::Image::writeAsync(size_t *origin, const void *ptr_to_host_data, const size_t *region, const EventList &list) const { TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0"); cl_event event_id; OPENCL_SAFE_CALL( clEnqueueWriteImage(this->activeQueue().id(), this->id(), CL_FALSE, origin, region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), &event_id) ); return ocl::Event(event_id, this->context()); }
void OpenCLImage::write(void *dataPtr, bool blockingWrite, size_t *pOrigin, size_t *pRegion, size_t rowPitch, size_t slicePitch) { if(pOrigin == NULL) pOrigin = origin; if(pRegion == NULL) pRegion = region; cl_int err = clEnqueueWriteImage(pOpenCL->getQueue(), clMemObject, blockingWrite, pOrigin, pRegion, rowPitch, slicePitch, dataPtr, 0, NULL, NULL); assert(err == CL_SUCCESS); }
/// Enqueues a command to write data from host memory to \p image. /// /// \see_opencl_ref{clEnqueueWriteImage} void enqueue_write_image(const image3d &image, const size_t origin[3], const size_t region[3], size_t input_row_pitch, size_t input_slice_pitch, const void *host_ptr, const wait_list &events = wait_list()) { BOOST_ASSERT(m_queue != 0); BOOST_ASSERT(image.get_context() == this->get_context()); cl_int ret = clEnqueueWriteImage( m_queue, image.get(), CL_TRUE, origin, region, input_row_pitch, input_slice_pitch, host_ptr, events.size(), events.get_event_ptr(), 0 ); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } }
void timedImageCLWrite( cl_command_queue queue, cl_mem image, void *ptr ) { CPerfCounter t1; cl_int ret; cl_event ev; t1.Start(); ret = clEnqueueWriteImage( queue, image, CL_FALSE, imageOrigin, imageRegion, 0,0, ptr, 0, NULL, &ev ); ASSERT_CL_RETURN( ret ); clFlush( queue ); spinForEventsComplete( 1, &ev ); t1.Stop(); tlog->Timer( "%32s %lf s %8.2lf GB/s\n", "clEnqueueWriteImage():", t1.GetElapsedTime(), nBytesRegion, 1 ); }
result plane::CopyToAsynch( const unsigned char &host_buffer, const int &host_cols, const int &host_rows, const int &host_pitch, cl_event *event) { if (!valid_) return FILTER_INVALID_PLANE_BUFFER_STATE; valid_ = false; cl_int cl_status = CL_SUCCESS; size_t zero_offset[] = {0, 0, 0}; size_t copy_region[] = {ByPowerOf2(host_cols, 2) >> 2, host_rows, 1}; cl_status = clEnqueueWriteImage(cq_, mem_, CL_FALSE, zero_offset, copy_region, host_pitch, 0, &host_buffer, 0, NULL, event); if (cl_status != CL_SUCCESS) { g_last_cl_error = cl_status; return FILTER_COPYING_TO_PLANE_FAILED; } valid_ = true; return FILTER_OK; }
void transfer(Image * I, int dest) { if(I->locality != dest) { size_t origin[3]; origin[0] = 0; origin[1] = 0; origin[2] = 0; size_t region[3]; region[0] = I->width; region[1] = I->height; region[2] = 1; if(dest == 1) { //if(DEBUG) printf("moving to gpu\n"); check(clEnqueueWriteImage(queue,I->GPU_Image,CL_FALSE,origin, region,0,0,I->CPU_Image,0,NULL,NULL)); I->locality = 1; //if(DEBUG) printf("done\n"); } else { //if(DEBUG) printf("moving to cpu\n"); check(clEnqueueReadImage(queue,I->GPU_Image,CL_FALSE,origin, region, 0,0,I->CPU_Image,0,NULL,NULL)); I->locality = 0; } } }
/** * \brief ocl::Image::write Transfers data from host memory to this Image. * * You can be sure that the data is read. Be sure that the queue * and this Image are in the same context. * \param queue is a command queue on which the command is executed. * \param origin is the 3D offset in bytes from which the Image is read. * \param ptr_to_host_data must point to a memory location whith region bytes available. * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}. */ void ocl::Image::write(const Queue& queue, size_t *origin, const void *ptr_to_host_data, const size_t *region, const EventList &list) const { TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0"); TRUE_ASSERT(queue.context() == *this->context(), "Context of queue and this must be equal"); OPENCL_SAFE_CALL( clEnqueueWriteImage(queue.id(), this->id(), CL_TRUE, origin, region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), NULL) ); OPENCL_SAFE_CALL( clFinish(queue.id()) ); }
bool Sobel::runOpenCL(Image input, Image output, const Params& params) { if (!initCL(params, sobel_kernel, "-cl-fast-relaxed-math")) { return false; } cl_int err; cl_kernel kernel; cl_mem d_input, d_output; cl_image_format format = {CL_RGBA, CL_UNORM_INT8}; kernel = clCreateKernel(m_program, "sobel", &err); CHECK_ERROR_OCL(err, "creating kernel", return false); d_input = clCreateImage2D( m_context, CL_MEM_READ_ONLY, &format, input.width, input.height, 0, NULL, &err); CHECK_ERROR_OCL(err, "creating input image", return false); d_output = clCreateImage2D( m_context, CL_MEM_WRITE_ONLY, &format, input.width, input.height, 0, NULL, &err); CHECK_ERROR_OCL(err, "creating output image", return false); size_t origin[3] = {0, 0, 0}; size_t region[3] = {input.width, input.height, 1}; err = clEnqueueWriteImage( m_queue, d_input, CL_TRUE, origin, region, 0, 0, input.data, 0, NULL, NULL); CHECK_ERROR_OCL(err, "writing image data", return false); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_output); CHECK_ERROR_OCL(err, "setting kernel arguments", return false); reportStatus("Running OpenCL kernel"); const size_t global[2] = {output.width, output.height}; const size_t *local = NULL; if (params.wgsize[0] && params.wgsize[1]) { local = params.wgsize; } // Timed runs for (int i = 0; i < params.iterations + 1; i++) { err = clEnqueueNDRangeKernel( m_queue, kernel, 2, NULL, global, local, 0, NULL, NULL); CHECK_ERROR_OCL(err, "enqueuing kernel", return false); // Start timing after warm-up run if (i == 0) { err = clFinish(m_queue); CHECK_ERROR_OCL(err, "running kernel", return false); startTiming(); } }
/** * \brief ocl::Image::writeAsync Transfers data from host memory to this Image. * * Waits until the event list is completed. Be sure that the queue * and this Image are in the same context. * \param queue is a command queue on which the command is executed. * \param origin is the 3D offset in bytes from which the Image is read. * \param ptr_to_host_data must point to a memory location whith region bytes available. * \param region is the 3D region of the data. It is given with {image_width, image_height, image_depth}. * \param list contains all events for which this command has to wait. * \return an event which can be further put into an event list for synchronization. */ ocl::Event ocl::Image::writeAsync(const Queue &queue, size_t *origin, const void *ptr_to_host_data, const size_t *region, const EventList &list) const { TRUE_ASSERT(ptr_to_host_data != NULL, "data == 0"); TRUE_ASSERT(queue.context() == *this->context(), "Context of queue and this must be equal"); cl_event event_id; OPENCL_SAFE_CALL( clEnqueueWriteImage(queue.id(), this->id(), CL_FALSE, origin, region, 0, 0, ptr_to_host_data, list.size(), list.events().data(), &event_id) ); return ocl::Event(event_id, this->context()); }
cl_int WINAPI wine_clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, const size_t * origin, const size_t * region, size_t input_row_pitch, size_t input_slice_pitch, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) { cl_int ret; TRACE("\n"); ret = clEnqueueWriteImage(command_queue, image, blocking_write, origin, region, input_row_pitch, input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event); return ret; }
void cl_copyTextureToDevice(cl_mem dst, void* src, int width, int height) { cl_int status; const size_t szTexOrigin[3] = {0, 0, 0}; const size_t szTexRegion[3] = {height, width, 1}; status = clEnqueueWriteImage(clCommandQueue, dst, CL_TRUE, szTexOrigin, szTexRegion, 0, 0, src, 0, NULL, NULL); if(cl_errChk(status, "write buffer texture")) { exit(1); } }
/** * \related cl_Mem_Object_t * * This function read data from Host-accessible memory region, defined by argument * 'source' & write that data into OpenCL Image memory object, definded by * argument'self' * @param[in,out] self pointer to structure, in which 'Write' function pointer * is defined to point on this function. * @param[in] blocking_flag flag, that denotes, should operation be blocking or not. * @param[in] source pointer to Host-accessible memory region, that * contain data to be write in OpenCL memory object. * @param[in] time_mode enumeration, that denotes how time measurement should be * performed. * @param[out] evt_to_generate pointer to OpenCL event that will be generated * at the end of operation. * * @return CL_SUCCESS in case of success, error code of type 'ret_code' otherwise. * * @see cl_err_codes.h for detailed error description. * @see 'cl_Error_t' structure for error handling. */ static ret_code Image_Send_To_Device( scow_Mem_Object *self, cl_bool blocking_flag, void *source, TIME_STUDY_MODE time_mode, cl_event *evt_to_generate, cl_command_queue explicit_queue) { cl_int ret = CL_SUCCESS; cl_event write_ready, *p_write_ready = (cl_event*) 0x0; OCL_CHECK_EXISTENCE(self, INVALID_BUFFER_GIVEN); OCL_CHECK_EXISTENCE(source, INVALID_BUFFER_GIVEN); const size_t origin[3] = { 0, 0, 0 }, region[3] = { self->width, self->height, 1 }; (evt_to_generate != NULL) ? (p_write_ready = evt_to_generate) : (p_write_ready = &write_ready); cl_command_queue q = (explicit_queue == NULL) ? (self->parent_thread->q_data_htod) : (explicit_queue); ret = clEnqueueWriteImage(q, self->cl_mem_object, blocking_flag, origin, region, self->row_pitch, 0, source, 0, NULL, p_write_ready); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret); switch (time_mode) { case MEASURE: self->timer->current_time_device = Gather_Time_uS(p_write_ready); self->timer->total_time_device += self->timer->current_time_device; break; case DONT_MEASURE: break; default: break; } if (p_write_ready != evt_to_generate){ clReleaseEvent(*p_write_ready); } return ret; }
bool piglit_cl_write_image(cl_command_queue command_queue, cl_mem image, const size_t *origin, const size_t *region, const void *ptr) { cl_int errNo; errNo = clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region, 0, 0, ptr, 0, NULL, NULL); if(!piglit_cl_check_error(errNo, CL_SUCCESS)) { fprintf(stderr, "Could not enqueue image write: %s\n", piglit_cl_get_error_name(errNo)); return false; } return true; }
/** Thread that receives image from client. * @param data struct dataTransfer casted variable. * @return NULL */ void *asyncDataRecvImage_thread(void *data) { struct dataSend* _data = (struct dataSend*)data; // Provide a server for the data transfer int fd = accept(_data->fd, (struct sockaddr*)NULL, NULL); if(fd < 0){ // we can't work, disconnect the client printf("ERROR: Can't listen on binded port.\n"); fflush(stdout); shutdown(_data->fd, 2); return CL_SUCCESS; } // We may wait manually for the events generated by ocland, // and then we can wait for the OpenCL generated ones. if(_data->num_events_in_wait_list){ oclandWaitForEvents(_data->num_events_in_wait_list, _data->event_wait_list); } // Receive the data Recv(&fd, _data->ptr, _data->cb, MSG_WAITALL); // Writre it into the buffer clEnqueueWriteImage(_data->command_queue,_data->mem,CL_FALSE, _data->buffer_origin,_data->region, _data->buffer_row_pitch,_data->buffer_slice_pitch, _data->ptr,0,NULL,&(_data->event->event)); // Wait until the data is copied before start cleaning up clWaitForEvents(1,&(_data->event->event)); // Clean up free(_data->ptr); _data->ptr = NULL; if(_data->event){ _data->event->status = CL_COMPLETE; } if(_data->want_event != CL_TRUE){ free(_data->event); _data->event = NULL; } if(_data->event_wait_list) free(_data->event_wait_list); _data->event_wait_list=NULL; // shutdown(fd, 2); // shutdown(_data->fd, 2); // Destroy the server to free the port close(fd); close(_data->fd); free(_data); _data=NULL; pthread_exit(NULL); return NULL; }
bool CL_Image3D::WriteData(const CL_CommandQueue* pCommandQueue, size_t uOriginX, size_t uOriginY, size_t uOriginZ, size_t uWidth, size_t uHeight, size_t uDepth, const void* pImgInput, bool bIsBlocking, CL_Event* pNewEvent, const CL_EventPool* pWaitList) { CL_CPP_CONDITIONAL_RETURN_FALSE(!m_Image); CL_CPP_CONDITIONAL_RETURN_FALSE(!pCommandQueue); const size_t uOrigin[3] = { uOriginX, uOriginY, uOriginZ }; const size_t uRegion[3] = { uWidth, uHeight, uDepth }; cl_uint uNumWaitEvents = pWaitList ? pWaitList->GetNumEvents() : 0; const cl_event* pWaitEvents = pWaitList ? pWaitList->GetEventPool() : NULL; cl_event NewEvent = NULL; // Write some data to the buffer object. const cl_command_queue CommandQueue = pCommandQueue->GetCommandQueue(); cl_int iErrorCode = clEnqueueWriteImage(CommandQueue, m_Image, (bIsBlocking) ? CL_TRUE : CL_FALSE, uOrigin, uRegion, m_uRowPitch, m_uSlicePitch, pImgInput, uNumWaitEvents, pWaitEvents, &NewEvent); CL_CPP_CATCH_ERROR(iErrorCode); CL_CPP_ON_ERROR_RETURN_FALSE(iErrorCode); if(NewEvent) { if(pNewEvent) pNewEvent->SetEvent(NewEvent); clReleaseEvent(NewEvent); } return true; }
cl_mem * push_image(clinfo_t *clinfo, image_t * image, cl_event * event) { cl_mem * image_buffer = malloc(sizeof(cl_mem)); cl_int err; *image_buffer = clCreateImage2D(clinfo->context, CL_MEM_READ_ONLY , image->image_fmt, image->size[0] , image->size[1], 0, 0, &err); if(err != CL_SUCCESS) { fprintf(stderr, "Failed to create image buffer, %i\n", err); return NULL; } size_t origin[] = {0,0,0}; size_t region[] = {image->size[0], image->size[1], 1}; err = clEnqueueWriteImage(clinfo->command_queue, *image_buffer , CL_FALSE, origin, region , 0, 0, *image->pixels, 0, NULL, event); if(err != CL_SUCCESS) { fprintf(stderr, "Failed to write image to memory %i\n", err); return NULL; } return image_buffer; }
/*! \param dst Valid device pointer \param src Host pointer that contains the data \param height Height of the image \param width Width of the image */ void cl_copyImageToDevice(cl_mem dst, void* src, size_t height, size_t width) { static int eventCnt = 0; cl_event* eventPtr = NULL, event; if(eventsEnabled) { eventPtr = &event; } cl_int status; size_t origin[3] = {0, 0, 0}; size_t region[3] = {width, height, 1}; status = clEnqueueWriteImage(commandQueue, dst, CL_TRUE, origin, region, 0, 0, src, 0, NULL, eventPtr); cl_errChk(status, "Writing image", true); if(eventsEnabled) { char* eventStr = catStringWithInt("copyImageToDevice", eventCnt++); events->newIOEvent(*eventPtr, eventStr); } }
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e2; int opsMAC = 2; // operations per MAC cl_short4 *in, *out; cl_half *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_half *) calloc(9*9, sizeof(*ck)); in[0].x = 0x3c00; in[1].x = 0x4000; in[dsize].x = 0x4100; ck[0] = 0x3c00; ck[1] = 0x4000; ck[9] = 0x3000; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }
int main(int argc, char *argv[]) { #ifdef DEBUG printf("Argument count = [%d]\n", argc); #endif if(argc!=2) { printf("Expecting one argument!\n"); exit(1); } if(argv[1]==NULL) { printf("Expecting one non-null argument!\n"); exit(1); } char *progName = argv[1]; char fileName[100]; sprintf(fileName, "./target/%s.cl",progName); printf("Using kernel file [%s], with kernel name [%s]\n", fileName, progName); cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float *result; int i; cl_mem image, out; cl_bool support; cl_image_format fmt; int num_out = 9; FILE *fp; char *source_str; size_t source_size, r_size; int mem_size = sizeof(cl_float4) * num_out; /*load the source code containing the kernel*/ fp = fopen (fileName, "r"); if (!fp) { fprintf(stderr, "failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /*Get platform and device info*/ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("ret_num_platforms = %d\n", ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1 ,&device_id, &ret_num_devices); printf("ret_num_platforms = %d\n", ret_num_platforms); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); result = (float*) malloc(mem_size); //check image support clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, &r_size); if (support != CL_TRUE) { puts("image not supported"); return 1; } command_queue = clCreateCommandQueue(context, device_id, 0, &ret); printf("queue ret = %d\n", ret); out = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size, NULL, &ret); printf("create buffer ret = %d\n", ret); fmt.image_channel_order = CL_R; fmt.image_channel_data_type = CL_FLOAT; image = clCreateImage2D(context, CL_MEM_READ_ONLY, &fmt, 4, 4, 0, 0, NULL); size_t origin[] = {0,0,0}; size_t region[] = {4,4,1}; float data[] = { 10,20,30,40, 10,20,30,40, 10,20,30,40, 10,20,30,40 }; clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region, 4*sizeof(float), 0, data, 0, NULL, NULL); program = clCreateProgramWithSource(context, 1, (const char**) &source_str, (const size_t*) &source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); printf("build program ret = %d\n", ret); kernel = clCreateKernel(program, progName, &ret); printf("create kernel ret = %d\n", ret); //How to set int arguments? ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*) &image); printf("arg 0 ret = %d\n", ret); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*) &out); printf("arg 1 ret = %d\n", ret); cl_event ev; ret = clEnqueueTask(command_queue, kernel, 0, NULL, &ev); //How to read a int? ret = clEnqueueReadBuffer(command_queue, out, CL_TRUE, 0, mem_size, result, 0, NULL, NULL); for(int i=0; i < num_out; i++) { printf("%f,%f,%f,%f\n", result[i*4+0], result[i*4+1], result[i*4+2], result[i*4+3]); } ret=clFlush(command_queue); ret=clFinish(command_queue); ret=clReleaseKernel(kernel); ret=clReleaseProgram(program); ret=clReleaseMemObject(out); ret=clReleaseMemObject(image); ret=clReleaseCommandQueue(command_queue); ret=clReleaseContext(context); free(source_str); printf("\n"); return 0; }
void spmv_csr_ocl(csr_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int groupnum) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowPtr; cl_mem devColId; cl_mem devData; cl_mem devVec; cl_mem devTexVec; cl_mem devRes; //Initialize values int nnz = mat->matinfo.nnz; int vecsize = mat->matinfo.width; int rownum = mat->matinfo.height; int rowptrsize = rownum + 1; ALLOCATE_GPU_READ(devRowPtr, mat->csr_row_ptr, sizeof(int)*rowptrsize); ALLOCATE_GPU_READ(devColId, mat->csr_col_id, sizeof(int)*nnz); ALLOCATE_GPU_READ(devData, mat->csr_data, sizeof(float)*nnz); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 16); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; //errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_mem devRowPtrPad; int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE); int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1)); memset(rowptrpad, 0, sizeof(int)*(padrowsize+1)); for (int i = 0; i <= mat->matinfo.height; i++) rowptrpad[i] = mat->csr_row_ptr[i]; ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1)); clFinish(cmdQueue); printf("\nRow Num %d padded size %d\n", rownum, padrowsize); cl_uint work_dim = 2; //int dim2 = 16; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_csr_ve_slm_pm_fs", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR; { size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2}; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE, time_in_sec / (double) ntimes * 1000, gflops, methodid); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (devRowPtrPad) clReleaseMemObject(devRowPtrPad); if (csrKernel) clReleaseKernel(csrKernel); free(rowptrpad); } { int methodid = 1; cl_mem devRowPtrPad; int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE); int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1)); memset(rowptrpad, 0, sizeof(int)*(padrowsize+1)); for (int i = 0; i <= mat->matinfo.height; i++) rowptrpad[i] = mat->csr_row_ptr[i]; ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1)); clFinish(cmdQueue); printf("\nRow Num %d padded size %d\n", rownum, padrowsize); cl_uint work_dim = 2; //int dim2 = 16; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_csr_ve_reduction_fs", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR; { size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2}; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE, time_in_sec / (double) ntimes * 1000, gflops, methodid); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (devRowPtrPad) clReleaseMemObject(devRowPtrPad); if (csrKernel) clReleaseKernel(csrKernel); free(rowptrpad); } //Clean up if (image2dVec) free(image2dVec); if (devRowPtr) clReleaseMemObject(devRowPtr); if (devColId) clReleaseMemObject(devColId); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
int main(int argc, const char** argv) { size_t x = 512, y = 250000; //y has to be a multiple of ciDeviceCount! struct svm_node* px = (struct svm_node*)malloc((x+1)*sizeof(struct svm_node)); gen_data(px, x, 1, 3); struct svm_node* py = (struct svm_node*)malloc((x+1)*y*sizeof(struct svm_node)); for(size_t i = 0; i < y; ++i) { struct svm_node* tmp = py+i*(x+1); gen_data(tmp, x, 3,2); } dtype* result = (dtype*)malloc(y*sizeof(dtype)); int* pyLength = (int*)malloc(y*sizeof(int)); for(size_t i = 0; i < y; ++i) { for(size_t j = 0; py[i*(x+1)+j].index >= 0; ++j) pyLength[i] = py[i*(x+1)+j].index; ++pyLength[i]; } cl_int err = CL_SUCCESS; // cl_platform_id platform = NULL; // cl_uint ciDeviceCount = 0; // cl_device_id *device = NULL; // retrieve devices cl_platform_id platform; err = clGetPlatformIDs(1, &platform, NULL); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); size_t localDim = 256l; size_t globalDim = localDim*y; /* device = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, ciDeviceCount, device, NULL); if (err != CL_SUCCESS) { printf("Failed to get devices:\n%s\n", oclErrorString(err)); return -1; } */ //Create the context cl_context context1 = clCreateContext(0, 1, &device, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("Context creation failed:\n%d\n", err); return -1; } // create a command queue for first device the context reported cl_command_queue queue = clCreateCommandQueue(context1, device, 0, 0); // load program from disk char *tmp = strdup(argv[0]); char* my_dir = dirname(tmp); // size_t program_length; char path[256]; snprintf(path, PATH_MAX - 1, "%s/vecops.cl", my_dir); cl_program vecops = load_kernel(path, context1); if(err != CL_SUCCESS) { printf("Program creation failed:\n%d\n", (err)); return -1; } err = clBuildProgram(vecops, 0, NULL, "-I.", NULL, NULL); if(err != CL_SUCCESS) { err = clGetProgramBuildInfo(vecops, device, CL_PROGRAM_BUILD_LOG, 8192, buffer, NULL); if(err != CL_SUCCESS) printf("Cannot get build info: %d\n", (err)); printf("Build log:\n%s\n", buffer); } // create kernel cl_kernel sparsedot_kernel; #if version == 1 sparsedot_kernel = clCreateKernel(vecops, "sparsedot1_kernel", &err); #endif #if version == 2 sparsedot_kernel = clCreateKernel(vecops, "sparsedot4_kernel", &err); #endif #if version == 3 sparsedot_kernel = clCreateKernel(vecops, "sparsedot3_kernel", &err); #endif if (err != CL_SUCCESS) { printf("Kernel creation failed:\n%d\n", (err)); return -1; } // allocate memory on the devices cl_mem px_d, py_d, result_d, pyLength_d; #if version == 1 px_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, (x+1) * sizeof(struct svm_node), 0, &err); #endif #if version == 2 || version == 3 //unpack px int size = px[x-1].index+1; for(size_t i = 0; i < y; ++i) size = size > pyLength[i] ? size : pyLength[i]; dtype* px_u = (dtype*)calloc(size, sizeof(dtype)); unpack(px, px_u); printf("px size: %d\n", size); #endif #if version == 3 size_t height, width; clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &height, 0); clGetDeviceInfo(Device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &width, 0); size_t region[3]; region[2] = 1; region[0] = min(4, size); region[1] = (size+2-1) / 4; cl_image_format px_format; px_format.image_channel_order = CL_R; px_format.image_channel_data_type = CL_FLOAT; #endif #if version == 2 px_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, size * sizeof(dtype), 0, &err); #endif #if version == 3 px_d = clCreateImage2D(context1, CL_MEM_READ_ONLY, &px_format, region[0], region[1], 0, 0, &err); #endif if(err != CL_SUCCESS) { printf("Failed to allocate px:\n%d\n", (err)); return -1; } py_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, (x+1) * y * sizeof(struct svm_node), 0, &err); if(err != CL_SUCCESS) { printf("Failed to allocate px:\n%d\n", (err)); return -1; } result_d = clCreateBuffer(context1, CL_MEM_WRITE_ONLY, y * sizeof(dtype), 0, 0); pyLength_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, y * sizeof(int), 0, 0); #if bench //start time measurement start_timer(0); #endif // copy host vectors to device err = CL_SUCCESS; err |= clEnqueueWriteBuffer(queue, py_d, CL_FALSE, 0, (x+1) * y * sizeof(struct svm_node), py, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, pyLength_d, CL_FALSE, 0, y * sizeof(int), pyLength, 0, NULL, NULL); #if version == 1 err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, (x+1) * sizeof(struct svm_node), px, 0, NULL, NULL); #endif #if version == 2 err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, size * sizeof(dtype), px_u, 0, NULL, NULL); #endif #if version == 3 size_t offset[] = {0,0,0}; err |= clEnqueueWriteImage(queue, px_d, CL_TRUE, offset, region, sizeof(dtype), 0, px_u, 0, 0, NULL); #endif clFinish(queue); if(err != CL_SUCCESS) { printf("Data transfer to GPU failed:\n%d\n", (err)); return -1; } #if bench stop_timer(0); start_timer(1); #endif // set kernel arguments clSetKernelArg(sparsedot_kernel, 0, sizeof(cl_mem), (void *) &px_d); clSetKernelArg(sparsedot_kernel, 1, sizeof(cl_mem), (void *) &py_d); clSetKernelArg(sparsedot_kernel, 2, sizeof(cl_mem), (void *) &result_d); clSetKernelArg(sparsedot_kernel, 3, sizeof(cl_mem), (void *) &pyLength_d); clSetKernelArg(sparsedot_kernel, 4, sizeof(cl_ulong), (void *) &x); clSetKernelArg(sparsedot_kernel, 5, sizeof(cl_ulong), (void *) &y); // clSetKernelArg(sparsedot_kernel, 6, sizeof(cl_float8)*localDim, 0); #if version == 3 clSetKernelArg(sparsedot_kernel, 7, sizeof(cl_long), (void *) ®ion[1]) ; clSetKernelArg(sparsedot_kernel, 8, sizeof(cl_long), (void *) ®ion[0]) ; #endif clFlush(queue); // start kernel err = clEnqueueNDRangeKernel(queue, sparsedot_kernel, 1, 0, &globalDim, &localDim, 0, NULL, 0); if(err != CL_SUCCESS) { printf("Kernel launch failed:\n%d\n", (err)); return -1; } clFinish(queue); #if bench stop_timer(1); start_timer(2); #endif cl_event result_gather; // Non-blocking copy of result from device to host err = clEnqueueReadBuffer(queue, result_d, CL_FALSE, 0, y * sizeof(dtype), result, 0, NULL, &result_gather); if(err != CL_SUCCESS) { printf("Reading result failed:\n%d\n", (err)); return -1; } // CPU sync with GPU clWaitForEvents(1, &result_gather); #if bench // stop GPU time measurement stop_timer(2); #endif //check result /* for(size_t i = 0; i < y; ++i) { printf("%f ", result[i]); } printf("\n"); */ #if bench start_timer(3); #endif bool correct = validate(px, py, result, x, y); #if bench stop_timer(3); printf("v%i; x: %lu, y: %lu\n", version, x, y); printf("CPU: %f, upcpy: %f DeviceCalc: %f, downcpy: %f\n", get_secs(3), get_secs(0), get_secs(1), get_secs(2)); #endif if(correct) printf("SUCCESS!\n"); //cleenup clReleaseKernel(sparsedot_kernel); clReleaseCommandQueue(queue); clReleaseEvent(result_gather); clReleaseMemObject(px_d); clReleaseMemObject(py_d); clReleaseMemObject(result_d); clReleaseMemObject(pyLength_d); // clReleaseDevice(device); free(px); #if version == 2 || version == 3 free(px_u); #endif free(py); free(result); return 0; }
void spmv_coo_ocl(coo_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int maxgroupnum) { for (int i = 0; i < mat->matinfo.height; i++) result[i] = 0.0f; cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowid; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; cl_mem devTmpRow; cl_mem devTmpData; //Initialize values int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int vecsize = mat->matinfo.width; int num_units = nnz / COO_GROUP_SIZE; if (nnz % COO_GROUP_SIZE != 0) num_units++; int group_num = (num_units < maxgroupnum) ? num_units : maxgroupnum; int work_size = group_num * COO_GROUP_SIZE; int num_iters = nnz / work_size; if (nnz % work_size != 0) num_iters++; int process_size = num_iters * COO_GROUP_SIZE; int active_warp = num_units / num_iters; if (num_units % num_iters != 0) active_warp++; int paddedNNZ = findPaddedSize(nnz, COO_ALIGNMENT); int* paddedRow = (int*)malloc(sizeof(int)*paddedNNZ); int* paddedCol = (int*)malloc(sizeof(int)*paddedNNZ); float* paddedData = (float*)malloc(sizeof(float)*paddedNNZ); memcpy(paddedRow, mat->coo_row_id, sizeof(int)*nnz); memcpy(paddedCol, mat->coo_col_id, sizeof(int)*nnz); memcpy(paddedData, mat->coo_data, sizeof(float)*nnz); for (int i = nnz; i < paddedNNZ; i++) { paddedRow[i] = mat->coo_row_id[nnz - 1]; paddedCol[i] = mat->coo_col_id[nnz - 1]; paddedData[i] = 0.0f; } ALLOCATE_GPU_READ(devRowid, paddedRow, sizeof(int)*paddedNNZ); ALLOCATE_GPU_READ(devColid, paddedCol, sizeof(int)*paddedNNZ); ALLOCATE_GPU_READ(devData, paddedData, sizeof(float)*paddedNNZ); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; devTmpRow = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*maxgroupnum, NULL, &errorCode); CHECKERROR; devTmpData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*maxgroupnum, NULL, &errorCode); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {COO_GROUP_SIZE, 1}; int gsize = group_num * COO_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_coo_s1", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &process_size); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &paddedNNZ); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(cl_mem), &devTmpRow); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 8, sizeof(cl_mem), &devTmpData); CHECKERROR; printf("process size %d nnz %d gsize %d active_warp %d\n", process_size, paddedNNZ, gsize, active_warp); size_t blocksize2[] = {COO_GROUP_SIZE * 2, 1}; size_t globalsize2[] = {COO_GROUP_SIZE * 2, dim2}; cl_kernel csrKernel2 = NULL; csrKernel2 = clCreateKernel(program, "gpu_coo_s2", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 0, sizeof(cl_mem), &devTmpRow); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 1, sizeof(cl_mem), &devTmpData); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 2, sizeof(int), &active_warp); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 3, sizeof(cl_mem), &devRes); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); //int* tmpRow = (int*)malloc(sizeof(int)*maxgroupnum); //float* tmpData = (float*)malloc(sizeof(float)*maxgroupnum); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel2, work_dim, NULL, globalsize2, blocksize2, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCOO cpu time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); if (csrKernel2) clReleaseKernel(csrKernel2); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } //for (int i = 0; i < active_warp; i++) //printf("Row %d Data %f\n", tmpRow[i], tmpData[i]); } //Clean up if (paddedRow) free(paddedRow); if (paddedCol) free(paddedCol); if (paddedData) free(paddedData); if (image2dVec) free(image2dVec); if (devRowid) clReleaseMemObject(devRowid); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); if (devTmpRow) clReleaseMemObject(devTmpRow); if (devTmpData) clReleaseMemObject(devTmpData); freeObjects(devices, &context, &cmdQueue, &program); }
/*! \brief Calls an OpenCL kernel from OpenVX Graph. * Steps: * \arg Find the target * \arg Get the vxcl context * \arg Find the kernel (to get cl kernel information) * \arg for each input parameter that is an object, enqueue write * \arg wait for finish * \arg for each parameter, SetKernelArg * \arg call kernel * \arg wait for finish * \arg for each output parameter that is an object, enqueue read * \arg wait for finish * \note This implementation will attempt to use the External API as much as possible, * but will cast to internal representation when needed (due to lack of API or * need for secret information). This is not an optimal OpenCL invocation. */ vx_status vxclCallOpenCLKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) { vx_status status = VX_FAILURE; vx_context context = node->base.context; vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity]; vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); vx_uint32 pidx, pln, didx, plidx, argidx; cl_int err = 0; size_t off_dim[3] = {0,0,0}; size_t work_dim[3]; //size_t local_dim[3]; cl_event writeEvents[VX_INT_MAX_PARAMS]; cl_event readEvents[VX_INT_MAX_PARAMS]; cl_int we = 0, re = 0; vxSemWait(&target->base.lock); // determine which platform to use plidx = 0; // determine which device to use didx = 0; /* for each input/bi data object, enqueue it and set the kernel parameters */ for (argidx = 0, pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; vx_enum type = node->kernel->signature.types[pidx]; vx_memory_t *memory = NULL; switch (type) { case VX_TYPE_ARRAY: memory = &((vx_array)ref)->memory; break; case VX_TYPE_CONVOLUTION: memory = &((vx_convolution)ref)->base.memory; break; case VX_TYPE_DISTRIBUTION: memory = &((vx_distribution)ref)->memory; break; case VX_TYPE_IMAGE: memory = &((vx_image)ref)->memory; break; case VX_TYPE_LUT: memory = &((vx_lut_t*)ref)->memory; break; case VX_TYPE_MATRIX: memory = &((vx_matrix)ref)->memory; break; //case VX_TYPE_PYRAMID: // break; case VX_TYPE_REMAP: memory = &((vx_remap)ref)->memory; break; //case VX_TYPE_SCALAR: //case VX_TYPE_THRESHOLD: // break; } if (memory) { for (pln = 0; pln < memory->nptrs; pln++) { if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { if (type == VX_TYPE_IMAGE) { /* set the work dimensions */ work_dim[0] = memory->dims[pln][VX_DIM_X]; work_dim[1] = memory->dims[pln][VX_DIM_Y]; // width, height, stride_x, stride_y err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n"); } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) { vx_array arr = (vx_array)ref; // sizeof item, active count, capacity err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output? err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]); VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n"); } else if (type == VX_TYPE_MATRIX) { vx_matrix mat = (vx_matrix)ref; // columns, rows err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows); VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n"); } else if (type == VX_TYPE_DISTRIBUTION) { vx_distribution dist = (vx_distribution)ref; // num, range, offset, winsize vx_uint32 range = dist->memory.dims[0][VX_DIM_X] * dist->window_x; err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&range); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->window_x); } else if (type == VX_TYPE_CONVOLUTION) { vx_convolution conv = (vx_convolution)ref; // columns, rows, scale err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale); } err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); CL_ERROR_MSG(err, "clSetKernelArg"); if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { err = clEnqueueWriteBuffer(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, 0, vxComputeMemorySize(memory, pln), memory->ptrs[pln], 0, NULL, &ref->event); } } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { vx_rectangle_t rect = {0}; vx_image image = (vx_image)ref; vxGetValidRegionImage(image, &rect); size_t origin[3] = {rect.start_x, rect.start_y, 0}; size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; /* set the work dimensions */ work_dim[0] = rect.end_x-rect.start_x; work_dim[1] = rect.end_y-rect.start_y; VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); CL_ERROR_MSG(err, "clSetKernelArg"); if (err != CL_SUCCESS) { VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx); } if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { err = clEnqueueWriteImage(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, origin, region, memory->strides[pln][VX_DIM_Y], 0, memory->ptrs[pln], 0, NULL, &ref->event); CL_ERROR_MSG(err, "clEnqueueWriteImage"); } } } } else { if (type == VX_TYPE_SCALAR) { vx_value_t value; // largest platform atomic vx_size size = 0ul; vx_scalar sc = (vx_scalar)ref; vx_enum stype = VX_TYPE_INVALID; vxReadScalarValue(sc, &value); vxQueryScalar(sc, VX_SCALAR_ATTRIBUTE_TYPE, &stype, sizeof(stype)); size = vxSizeOfType(stype); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value); } else if (type == VX_TYPE_THRESHOLD) { vx_enum ttype = 0; vx_threshold th = (vx_threshold)ref; vxQueryThreshold(th, VX_THRESHOLD_ATTRIBUTE_TYPE, &ttype, sizeof(ttype)); if (ttype == VX_THRESHOLD_TYPE_BINARY) { err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value); } else if (ttype == VX_THRESHOLD_TYPE_RANGE) { err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower); err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper); } } } } we = 0; for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); } } //local_dim[0] = 1; //local_dim[1] = 1; err = clEnqueueNDRangeKernel(context->queues[plidx][didx], vxclk->kernels[plidx], 2, off_dim, work_dim, NULL, we, writeEvents, &node->base.event); CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); /* enqueue a read on all output data */ for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; vx_enum type = node->kernel->signature.types[pidx]; if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) { vx_memory_t *memory = NULL; switch (type) { case VX_TYPE_ARRAY: memory = &((vx_array)ref)->memory; break; case VX_TYPE_CONVOLUTION: memory = &((vx_convolution)ref)->base.memory; break; case VX_TYPE_DISTRIBUTION: memory = &((vx_distribution)ref)->memory; break; case VX_TYPE_IMAGE: memory = &((vx_image)ref)->memory; break; case VX_TYPE_LUT: memory = &((vx_lut_t*)ref)->memory; break; case VX_TYPE_MATRIX: memory = &((vx_matrix)ref)->memory; break; //case VX_TYPE_PYRAMID: // break; case VX_TYPE_REMAP: memory = &((vx_remap)ref)->memory; break; //case VX_TYPE_SCALAR: //case VX_TYPE_THRESHOLD: // break; } if (memory) { for (pln = 0; pln < memory->nptrs; pln++) { if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { err = clEnqueueReadBuffer(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, 0, vxComputeMemorySize(memory, pln), memory->ptrs[pln], 1, &node->base.event, &ref->event); CL_ERROR_MSG(err, "clEnqueueReadBuffer"); } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { vx_rectangle_t rect = {0}; vx_image image = (vx_image)ref; vxGetValidRegionImage(image, &rect); size_t origin[3] = {rect.start_x, rect.start_y, 0}; size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; /* set the work dimensions */ work_dim[0] = rect.end_x-rect.start_x; work_dim[1] = rect.end_y-rect.start_y; err = clEnqueueReadImage(context->queues[plidx][didx], memory->hdls[pln], CL_TRUE, origin, region, memory->strides[pln][VX_DIM_Y], 0, memory->ptrs[pln], 1, &node->base.event, &ref->event); CL_ERROR_MSG(err, "clEnqueueReadImage"); VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]); } } } } } re = 0; for (pidx = 0; pidx < num; pidx++) { vx_reference ref = node->parameters[pidx]; vx_enum dir = node->kernel->signature.directions[pidx]; if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) { memcpy(&readEvents[re++],&ref->event, sizeof(cl_event)); } } err = clFlush(context->queues[plidx][didx]); CL_ERROR_MSG(err, "Flush"); VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); clWaitForEvents(re, readEvents); if (err == CL_SUCCESS) status = VX_SUCCESS; //exit: VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); vxSemPost(&target->base.lock); return status; }
void spmv_b4ell_ocl(b4ell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, int& optmethod, char* oclfilename, cl_device_type deviceType, float* coores, int ntimes, int bw, int bh) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int col_align = mat->b4ell_height_aligned; int data_align = mat->b4ell_float4_aligned; int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int blockrownum = mat->b4ell_row_num; int vecsize = mat->matinfo.width; int b4ellnum = mat->b4ell_block_num; int bwidth = mat->b4ell_bwidth; int bheight = mat->b4ell_bheight; int width4num = bwidth / 4; int padveclen = findPaddedSize(vecsize, 8); float* paddedvec = (float*)malloc(sizeof(float)*padveclen); memset(paddedvec, 0, sizeof(float)*padveclen); memcpy(paddedvec, vec, sizeof(float)*vecsize); ALLOCATE_GPU_READ(devColid, mat->b4ell_col_id, sizeof(int)*col_align*b4ellnum); ALLOCATE_GPU_READ(devData, mat->b4ell_data, sizeof(float)*data_align*bheight*width4num*b4ellnum); ALLOCATE_GPU_READ(devVec, paddedvec, sizeof(float)*padveclen); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_RGBA, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; if (height % 4 != 0) height += (4 - (height % 4)); float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height/4, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height/4, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 1; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_mad"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block mad cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 100; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_tx"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 101; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_mad_tx"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block mad tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } //Clean up if (image2dVec) free(image2dVec); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
int main(int argc, char **argv) { /* Host data */ float *hInputImage = NULL; float *hOutputImage = NULL; /* Angle for rotation (degrees) */ const float theta = 45.0f; /* Allocate space for the input image and read the * data from disk */ int imageRows; int imageCols; hInputImage = readBmpFloat("../../Images/cat-face.bmp", &imageRows, &imageCols); const int imageElements = imageRows*imageCols; const size_t imageSize = imageElements*sizeof(float); /* Allocate space for the output image */ hOutputImage = (float*)malloc(imageSize); if (!hOutputImage) { exit(-1); } /* Use this to check the output of each API call */ cl_int status; /* Get the first platform */ cl_platform_id platform; status = clGetPlatformIDs(1, &platform, NULL); check(status); /* Get the first device */ cl_device_id device; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); check(status); /* Create a context and associate it with the device */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); check(status); /* Create a command queue and associate it with the device */ cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, device, 0, &status); check(status); /* The image descriptor describes how the data will be stored * in memory. This descriptor initializes a 2D image with no pitch */ cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = imageCols; desc.image_height = imageRows; desc.image_depth = 0; desc.image_array_size = 0; desc.image_row_pitch = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0; desc.buffer = NULL; /* The image format describes the properties of each pixel */ cl_image_format format; format.image_channel_order = CL_R; // single channel format.image_channel_data_type = CL_FLOAT; /* Create the input image and initialize it using a * pointer to the image data on the host. */ cl_mem inputImage = clCreateImage(context, CL_MEM_READ_ONLY, &format, &desc, NULL, NULL); /* Create the output image */ cl_mem outputImage = clCreateImage(context, CL_MEM_WRITE_ONLY, &format, &desc, NULL, NULL); /* Copy the host image data to the device */ size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from size_t region[3] = {imageCols, imageRows, 1}; // Elements to per dimension clEnqueueWriteImage(cmdQueue, inputImage, CL_TRUE, origin, region, 0 /* row-pitch */, 0 /* slice-pitch */, hInputImage, 0, NULL, NULL); /* Create a program with source code */ char *programSource = readFile("image-rotation.cl"); size_t programSourceLen = strlen(programSource); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, &programSourceLen, &status); check(status); /* Build (compile) the program for the device */ status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (status != CL_SUCCESS) { printCompilerError(program, device); exit(-1); } /* Create the kernel */ cl_kernel kernel; kernel = clCreateKernel(program, "rotation", &status); check(status); /* Set the kernel arguments */ status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage); status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage); status |= clSetKernelArg(kernel, 2, sizeof(int), &imageCols); status |= clSetKernelArg(kernel, 3, sizeof(int), &imageRows); status |= clSetKernelArg(kernel, 4, sizeof(float), &theta); check(status); /* Define the index space and work-group size */ size_t globalWorkSize[2]; globalWorkSize[0] = imageCols; globalWorkSize[1] = imageRows; size_t localWorkSize[2]; localWorkSize[0] = 8; localWorkSize[1] = 8; /* Enqueue the kernel for execution */ status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); check(status); /* Read the output image buffer to the host */ status = clEnqueueReadImage(cmdQueue, outputImage, CL_TRUE, origin, region, 0 /* row-pitch */, 0 /* slice-pitch */, hOutputImage, 0, NULL, NULL); check(status); /* Write the output image to file */ writeBmpFloat(hOutputImage, "rotated-cat.bmp", imageRows, imageCols, "../../Images/cat-face.bmp"); /* Free OpenCL resources */ clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(inputImage); clReleaseMemObject(outputImage); clReleaseContext(context); /* Free host resources */ free(hInputImage); free(hOutputImage); free(programSource); return 0; }
void spmv_sell_ocl(sell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devSlicePtr; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int vecsize = mat->matinfo.width; int sliceheight = mat->sell_slice_height; int slicenum = mat->sell_slice_num; int datasize = mat->sell_slice_ptr[slicenum]; ALLOCATE_GPU_READ(devSlicePtr, mat->sell_slice_ptr, sizeof(int)*(slicenum + 1)); ALLOCATE_GPU_READ(devColid, mat->sell_col_id, sizeof(int)*datasize); ALLOCATE_GPU_READ(devData, mat->sell_data, sizeof(float)*datasize); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); int dim2 = dim2Size; if (sliceheight == WARPSIZE) { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {SELL_GROUP_SIZE, 1}; int gsize = ((rownum + SELL_GROUP_SIZE - 1)/SELL_GROUP_SIZE)*SELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; //printf("gsize %d rownum %d slicenum %d sliceheight %d datasize %d nnz %d vecsize %d \n", gsize, rownum, slicenum, sliceheight, datasize, nnz, vecsize); //int warpnum = SELL_GROUP_SIZE / WARPSIZE; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_sell_warp", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &slicenum); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nSELL cpu warp time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (sliceheight == SELL_GROUP_SIZE) { int methodid = 1; cl_uint work_dim = 2; size_t blocksize[] = {SELL_GROUP_SIZE, 1}; int gsize = slicenum * SELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_sell_group", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &slicenum); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nSELL cpu group time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } //Clean up if (image2dVec) free(image2dVec); if (devSlicePtr) clReleaseMemObject(devSlicePtr); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
/** vglClUpload branch3d */ void vglClUpload(VglImage* img) { if (Interop && img->nChannels > 1) { vglClUploadInterop(img); } else { if (img->nChannels == 3) { fprintf(stderr, "%s: %s: Error: image with 3 channels not supported. Please convert to 4 channels.\n", __FILE__, __FUNCTION__); exit(1); } cl_int err; if ( !vglIsInContext(img, VGL_RAM_CONTEXT) && !vglIsInContext(img, VGL_BLANK_CONTEXT) ) { fprintf(stderr, "vglClUpload: Error: image context = %d not in VGL_RAM_CONTEXT or VGL_BLANK_CONTEXT\n", img->inContext); return; } if (img->oclPtr == NULL) { /*if (img->fbo != -1) { img->oclPtr = clCreateFromGLTexture2D(cl.context,CL_MEM_READ_WRITE,GL_TEXTURE_2D,0,img->fbo,&err); vglClCheckError( err, (char*) "clCreateFromGlTexture2D interop" ); clEnqueueAcquireGLObjects(cl.commandQueue, 1, &img->oclPtr, 0,0,0); } else {*/ cl_image_format format; if (img->nChannels == 1) { format.image_channel_order = CL_R; } else if (img->nChannels == 4) { format.image_channel_order = CL_RGBA; } if (img->depth == IPL_DEPTH_8U) { format.image_channel_data_type = CL_UNORM_INT8; } else if (img->depth == IPL_DEPTH_16U) { format.image_channel_data_type = CL_UNORM_INT16; } else if (img->depth == IPL_DEPTH_32S) { format.image_channel_data_type = CL_SIGNED_INT32; } else { fprintf(stderr, "%s: %s: Error: Unsupported image depth = %d.\n", __FILE__, __FUNCTION__, img->depth); format.image_channel_data_type = CL_UNORM_INT8; } if (img->ndim == 2) { img->oclPtr = clCreateImage2D(cl.context, CL_MEM_READ_WRITE, &format, img->shape[VGL_WIDTH], img->shape[VGL_HEIGHT], 0, NULL, &err); vglClCheckError( err, (char*) "clCreateImage2D" ); } else if(img->ndim == 3) { img->oclPtr = clCreateImage3D(cl.context, CL_MEM_READ_WRITE, &format, img->shape[VGL_WIDTH], img->shape[VGL_HEIGHT], img->shape[VGL_LENGTH], 0, 0, NULL, &err); vglClCheckError( err, (char*) "clCreateImage3D" ); } else { img->oclPtr = clCreateBuffer(cl.context, CL_MEM_READ_WRITE, img->getTotalSizeInBytes(), NULL, &err); vglClCheckError( err, (char*) "clCreateNDImage" ); } /* cl_image_desc desc; if (img->ndim == 2) { desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = img->shape[VGL_WIDTH]; desc.image_height = img->shape[VGL_HEIGHT]; desc.image_depth = 0; desc.image_array_size = 1; desc.image_row_pitch = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0; desc.buffer = NULL; } else { desc.image_type = CL_MEM_OBJECT_IMAGE3D; desc.image_width = img->shape[VGL_WIDTH]; desc.image_height = img->shape[VGL_HEIGHT]; desc.image_depth = img->shape[VGL_LENGTH]; desc.image_array_size = 0; desc.image_row_pitch = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0; desc.buffer = NULL; } img->oclPtr = clCreateImage(cl.context,CL_MEM_READ_WRITE, &format, &desc,NULL,&err); vglClCheckError(err, (char*) "clCreateImage"); */ } if (vglIsInContext(img, VGL_RAM_CONTEXT)) { size_t Origin[3] = { 0, 0, 0}; int nFrames = 1; if(img->ndim == 3) { nFrames = img->shape[VGL_LENGTH]; } void* imageData = img->getImageData(); if (!imageData) { fprintf(stderr, "%s: %s: Error: both ipl and ndarray are NULL.\n", __FILE__, __FUNCTION__); exit(1); } if ( (img->ndim == 2) || (img->ndim == 3) ) { size_t Size3d[3] = {img->shape[VGL_WIDTH], img->shape[VGL_HEIGHT], nFrames}; err = clEnqueueWriteImage( cl.commandQueue, img->oclPtr, CL_TRUE, Origin, Size3d, 0, 0, (char*)imageData, 0, NULL, NULL ); vglClCheckError( err, (char*) "clEnqueueWriteImage" ); clFinish(cl.commandQueue); } else { err = clEnqueueWriteBuffer(cl.commandQueue, img->oclPtr, CL_TRUE, 0, img->getTotalSizeInBytes(), imageData, 0, NULL, NULL); vglClCheckError( err, (char*) "clEnqueueWriteBuffer" ); clFinish(cl.commandQueue); } } vglAddContext(img, VGL_CL_CONTEXT); } }
int main() { cl_platform_id platform_id = NULL; cl_uint ret_num_platforms; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; size_t kernel_code_size; char *kernel_src_str; float *result; cl_int ret; int i; FILE *fp; size_t r_size; cl_mem image, out; cl_bool support; cl_image_format fmt; int num_out = 9; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); result = (float*)malloc(sizeof(cl_float4)*num_out); /* Check if the device support images */ clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(support), &support, &r_size); if (support != CL_TRUE) { puts("image not supported"); return 1; } command_queue = clCreateCommandQueue(context, device_id, 0, &ret); fp = fopen("image.cl", "r"); kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE); kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Create output buffer */ out = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4)*num_out, NULL, &ret); /* Create data format for image creation */ fmt.image_channel_order = CL_R; fmt.image_channel_data_type = CL_FLOAT; /* Create Image Object */ image = clCreateImage2D(context, CL_MEM_READ_ONLY, &fmt, 4, 4, 0, 0, NULL); /* Set parameter to be used to transfer image object */ size_t origin[] = {0, 0, 0}; /* Transfer target coordinate*/ size_t region[] = {4, 4, 1}; /* Size of object to be transferred */ float data[] = { /* Transfer Data */ 10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30, 40, 10, 20, 30, 40, }; /* Transfer to device */ clEnqueueWriteImage(command_queue, image, CL_TRUE, origin, region, 4*sizeof(float), 0, data, 0, NULL, NULL); /* Build program */ program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret); clBuildProgram(program, 1, &device_id, "", NULL, NULL); kernel = clCreateKernel(program, "image_test", &ret); /* Set Kernel Arguments */ clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&image); clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&out); cl_event ev; clEnqueueTask(command_queue, kernel, 0, NULL, &ev); /* Retrieve result */ clEnqueueReadBuffer(command_queue, out, CL_TRUE, 0, sizeof(cl_float4)*num_out, result, 0, NULL, NULL); for (i=0; i < num_out; i++) { printf("%f,%f,%f,%f\n",result[i*4+0],result[i*4+1],result[i*4+2],result[i*4+3]); } clReleaseMemObject(out); clReleaseMemObject(image); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(kernel_src_str); free(result); return 0; }
END_TEST START_TEST (test_read_write_image) { cl_platform_id platform = 0; cl_device_id device; cl_context ctx; cl_command_queue queue; cl_mem image2d, part2d; cl_int result; unsigned char image2d_data_24bpp[3*3*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 128, 128, 128, 0, 0, 0, 255, 0, 255, 255, 0, 0, 0, 128, 0, 0, 255, 128, 0, 0, 128, 0, 255, 0, 0, 0, 0, 0 }; unsigned char image2d_part_24bpp[2*2*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 0, 0, 255, 0, 255, 255, 0, 0 }; unsigned char image2d_buffer[3*3*4]; unsigned char image2d_part[2*2*4]; cl_image_format fmt; fmt.image_channel_data_type = CL_UNORM_INT8; fmt.image_channel_order = CL_RGBA; size_t origin[3] = {0, 0, 0}; size_t region[3] = {3, 3, 1}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 3, 3, 0, image2d_buffer, &result); fail_if( result != CL_SUCCESS || image2d == 0, "cannot create a valid 3x3 image2D" ); part2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 2, 2, 0, image2d_part, &result); fail_if( result != CL_SUCCESS || image2d == 0, "cannot create a valid 2x2 image2D" ); // Write data in buffer result = clEnqueueWriteImage(queue, image2d, 1, origin, region, 0, 0, image2d_data_24bpp, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking write image event" ); // Read it back region[0] = 2; region[1] = 2; result = clEnqueueReadImage(queue, image2d, 1, origin, region, 0, 0, image2d_part, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking read image event" ); // Compare fail_if( std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0, "reading and writing images doesn't produce the correct result" ); // Read it back using a buffer cl_event event; std::memset(image2d_part, 0, sizeof(image2d_part)); result = clEnqueueCopyImage(queue, image2d, part2d, origin, origin, region, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to enqueue a copy image event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for events" ); // Compare fail_if( std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0, "copying images doesn't produce the correct result" ); clReleaseEvent(event); clReleaseMemObject(part2d); clReleaseMemObject(image2d); clReleaseCommandQueue(queue); clReleaseContext(ctx); }