/// Enqueues a command to fill \p buffer with \p pattern. /// /// \see_opencl_ref{clEnqueueFillBuffer} /// /// \opencl_version_warning{1,2} /// /// \see fill() event enqueue_fill_buffer(const buffer &buffer, const void *pattern, size_t pattern_size, size_t offset, size_t size, const wait_list &events = wait_list()) { BOOST_ASSERT(m_queue != 0); BOOST_ASSERT(offset + size <= buffer.size()); BOOST_ASSERT(buffer.get_context() == this->get_context()); event event_; cl_int ret = clEnqueueFillBuffer( m_queue, buffer.get(), pattern, pattern_size, offset, size, events.size(), events.get_event_ptr(), &event_.get() ); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } return event_; }
JNIEXPORT jint JNICALL Java_org_lwjgl_opencl_CL12_nclEnqueueFillBuffer(JNIEnv *env, jclass clazz, jlong command_queue, jlong buffer, jlong pattern, jlong pattern_size, jlong offset, jlong size, jint num_events_in_wait_list, jlong event_wait_list, jlong event, jlong function_pointer) { const cl_void *pattern_address = (const cl_void *)(intptr_t)pattern; const cl_event *event_wait_list_address = (const cl_event *)(intptr_t)event_wait_list; cl_event *event_address = (cl_event *)(intptr_t)event; clEnqueueFillBufferPROC clEnqueueFillBuffer = (clEnqueueFillBufferPROC)((intptr_t)function_pointer); cl_int __result = clEnqueueFillBuffer((cl_command_queue)(intptr_t)command_queue, (cl_mem)(intptr_t)buffer, pattern_address, pattern_size, offset, size, num_events_in_wait_list, event_wait_list_address, event_address); return __result; }
cl_int zero_cl_mem(clctx_t *c, cl_mem buffer, size_t size) { cl_int ret; uint32_t z = 0; ret = clEnqueueFillBuffer (c->command_queue, buffer, &z, sizeof(z), 0, size, 0, NULL, NULL); CL_ERR_RET("clEnqueueFillBuffer (in zero_cl_mem)", ret); return ret; }
void oskar_mem_clear_contents(oskar_Mem* mem, int* status) { size_t size; /* Check if safe to proceed. */ if (*status) return; /* Compute the size. */ size = mem->num_elements * oskar_mem_element_size(mem->type); /* Clear the memory. */ if (mem->location == OSKAR_CPU) { memset(mem->data, 0, size); } else if (mem->location == OSKAR_GPU) { #ifdef OSKAR_HAVE_CUDA cudaMemset(mem->data, 0, size); #else *status = OSKAR_ERR_CUDA_NOT_AVAILABLE; #endif } else if (mem->location & OSKAR_CL) { #ifdef OSKAR_HAVE_OPENCL cl_event event; cl_int error; char zero = '\0'; error = clEnqueueFillBuffer(oskar_cl_command_queue(), mem->buffer, &zero, sizeof(char), 0, size, 0, NULL, &event); clWaitForEvents(1, &event); /* This is required. */ if (error != CL_SUCCESS) { fprintf(stderr, "clEnqueueFillBuffer() error (%d)\n", error); *status = OSKAR_ERR_INVALID_ARGUMENT; } #else *status = OSKAR_ERR_OPENCL_NOT_AVAILABLE; #endif } else { *status = OSKAR_ERR_BAD_LOCATION; } }
static void clwClearOutOrPart(int p_n_layers, int* p_n_neurons, std::vector<cl_mem> p_buffers) { clwInitLib(); char pattern = 0; cl_int res; for (int i=1; i<p_n_layers; i++) { res = clEnqueueFillBuffer(g_cl_Command_Queue, p_buffers[i], &pattern, 1, 0, sizeof(real) * (p_n_neurons[i]-1), 0, nullptr, nullptr); if (res != CL_SUCCESS) {std::cerr<<"clCrearOutOrPart :"<<res<<std::endl; getchar(); } } #ifdef FINISH res = clFinish(g_cl_Command_Queue); #else res = clEnqueueBarrierWithWaitList(g_cl_Command_Queue, 0, nullptr, nullptr); #endif if (res != CL_SUCCESS) {std::cerr<<"clCrearOutOrPart :"<<res<<std::endl; getchar(); } }
/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array. */ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear) { if (natoms_clear == 0) { return; } cl_int gmx_used_in_debug cl_error; cl_atomdata_t *atomData = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; cl_float value = 0.0f; cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("nbnxn_ocl_clear_f failed: " + ocl_get_error_string(cl_error)).c_str()); }
static int enqueue_zero_buffer(cl_command_queue queue, cl_mem buffer, size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, cl_int *err) { cl_int _err; cl_uchar c = 0; if (!err) err = &_err; *err = clEnqueueFillBuffer(queue, (cl_mem)buffer, &c, sizeof(c), 0, size, num_events_in_wait_list, event_wait_list, event); CHECK_CL_ERROR(*err); return 0; error: return -1; }
int main(void) { cl_int err; cl_platform_id platforms[MAX_PLATFORMS]; cl_uint nplatforms; cl_device_id devices[MAX_DEVICES]; cl_uint ndevices; cl_uint i, j; size_t el, row, col; CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms)); for (i = 0; i < nplatforms; i++) { CHECK_CL_ERROR(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &ndevices)); /* Only test the devices we actually have room for */ if (ndevices > MAX_DEVICES) ndevices = MAX_DEVICES; for (j = 0; j < ndevices; j++) { /* skip devices that do not support images */ cl_bool has_img; CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE_SUPPORT, sizeof(has_img), &has_img, NULL)); if (!has_img) continue; cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateContext"); cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err); CHECK_OPENCL_ERROR_IN("clCreateCommandQueue"); cl_ulong alloc; size_t max_height; size_t max_width; #define MAXALLOC (1024U*1024U) CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc), &alloc, NULL)); CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(max_width), &max_width, NULL)); CHECK_CL_ERROR(clGetDeviceInfo(devices[j], CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(max_height), &max_height, NULL)); while (alloc > MAXALLOC) alloc /= 2; // fit at least one max_width inside the alloc (shrink max_width for this) while (max_width*pixel_size > alloc) max_width /= 2; // round number of elements to next multiple of max_width elements const size_t nels = (alloc/pixel_size/max_width)*max_width; const size_t buf_size = nels*pixel_size; cl_image_desc img_desc; memset(&img_desc, 0, sizeof(img_desc)); img_desc.image_type = CL_MEM_OBJECT_IMAGE2D; img_desc.image_width = max_width; img_desc.image_height = nels/max_width; img_desc.image_depth = 1; cl_ushort null_pixel[4] = {0, 0, 0, 0}; cl_ushort *host_buf = malloc(buf_size); TEST_ASSERT(host_buf); for (el = 0; el < nels; el+=4) { host_buf[el] = el & CHANNEL_MAX; host_buf[el+1] = (CHANNEL_MAX - el) & CHANNEL_MAX; host_buf[el+2] = (CHANNEL_MAX/((el & 1) + 1) - el) & CHANNEL_MAX; host_buf[el+3] = (CHANNEL_MAX - el/((el & 1) + 1)) & CHANNEL_MAX; } cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateBuffer"); cl_mem img = clCreateImage(context, CL_MEM_READ_WRITE, &img_format, &img_desc, NULL, &err); CHECK_OPENCL_ERROR_IN("clCreateImage"); CHECK_CL_ERROR(clEnqueueWriteBuffer(queue, buf, CL_TRUE, 0, buf_size, host_buf, 0, NULL, NULL)); const size_t offset = 0; const size_t origin[] = {0, 0, 0}; const size_t region[] = {img_desc.image_width, img_desc.image_height, 1}; CHECK_CL_ERROR(clEnqueueCopyBufferToImage(queue, buf, img, offset, origin, region, 0, NULL, NULL)); size_t row_pitch, slice_pitch; cl_ushort *img_map = clEnqueueMapImage(queue, img, CL_TRUE, CL_MAP_READ, origin, region, &row_pitch, &slice_pitch, 0, NULL, NULL, &err); CHECK_OPENCL_ERROR_IN("clEnqueueMapImage"); CHECK_CL_ERROR(clFinish(queue)); for (row = 0; row < img_desc.image_height; ++row) { for (col = 0; col < img_desc.image_width; ++col) { cl_ushort *img_pixel = (cl_ushort*)((char*)img_map + row*row_pitch) + col*4; cl_ushort *buf_pixel = host_buf + (row*img_desc.image_width + col)*4; if (memcmp(img_pixel, buf_pixel, pixel_size) != 0) printf("%zu %zu %zu : %x %x %x %x | %x %x %x %x\n", row, col, (size_t)(buf_pixel - host_buf), buf_pixel[0], buf_pixel[1], buf_pixel[2], buf_pixel[3], img_pixel[0], img_pixel[1], img_pixel[2], img_pixel[3]); TEST_ASSERT(memcmp(img_pixel, buf_pixel, pixel_size) == 0); } } CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, img, img_map, 0, NULL, NULL)); /* Clear the buffer, and ensure it has been cleared */ CHECK_CL_ERROR(clEnqueueFillBuffer(queue, buf, null_pixel, sizeof(null_pixel), 0, buf_size, 0, NULL, NULL)); cl_ushort *buf_map = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err); CHECK_OPENCL_ERROR_IN("clEnqueueMapBuffer"); CHECK_CL_ERROR(clFinish(queue)); for (el = 0; el < nels; ++el) { #if 0 // debug if (buf_map[el] != 0) { printf("%zu/%zu => %u\n", el, nels, buf_map[el]); } #endif TEST_ASSERT(buf_map[el] == 0); } CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, buf, buf_map, 0, NULL, NULL)); /* Copy data from image to buffer, and check that it's again equal to the original buffer */ CHECK_CL_ERROR(clEnqueueCopyImageToBuffer(queue, img, buf, origin, region, offset, 0, NULL, NULL)); buf_map = clEnqueueMapBuffer(queue, buf, CL_TRUE, CL_MAP_READ, 0, buf_size, 0, NULL, NULL, &err); CHECK_CL_ERROR(clFinish(queue)); TEST_ASSERT(memcmp(buf_map, host_buf, buf_size) == 0); CHECK_CL_ERROR ( clEnqueueUnmapMemObject (queue, buf, buf_map, 0, NULL, NULL)); CHECK_CL_ERROR (clFinish (queue)); free(host_buf); CHECK_CL_ERROR (clReleaseMemObject (img)); CHECK_CL_ERROR (clReleaseMemObject (buf)); CHECK_CL_ERROR (clReleaseCommandQueue (queue)); CHECK_CL_ERROR (clReleaseContext (context)); } } return EXIT_SUCCESS; }
void LSHReservoirSampler::initHelper(int numTablesIn, int numHashPerFamilyIn, int reservoriSizeIn) { /* Reservoir Random Number. */ std::cout << "[LSHReservoirSampler::initHelper] Generating random number for reservoir sampling ..." << std::endl; std::default_random_engine generator1; std::uniform_int_distribution<unsigned int> distribution_a(0, 0x7FFFFFFF); _sechash_a = distribution_a(generator1) * 2 + 1; std::uniform_int_distribution<unsigned int> distribution_b(0, 0xFFFFFFFF >> _numSecHash); _sechash_b = distribution_b(generator1); _global_rand = new unsigned int[_maxReservoirRand]; for (unsigned int i = 0; i < _maxReservoirRand; i++) { std::uniform_int_distribution<unsigned int> distribution(0, i); _global_rand[i] = distribution(generator1); } #if defined OPENCL_HASHTABLE _globalRand_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE, _maxReservoirRand * sizeof(unsigned int), NULL, &_err); _err = clEnqueueWriteBuffer(command_queue_gpu, _globalRand_obj, CL_TRUE, 0, _maxReservoirRand * sizeof(unsigned int), _global_rand, 0, NULL, NULL); #endif std::cout << "Completed. " << std::endl; /* Hash tables. */ _tableMemReservoirMax = (_numTables - 1) * _aggNumReservoirs + _numReservoirsHashed; _tableMemMax = _tableMemReservoirMax * (1 + _reservoirSize); _tablePointerMax = _numTables * _numReservoirsHashed; #if defined OPENCL_HASHTABLE std::cout << "Initializing GPU-OpenCL tables and pointers ... " << std::endl; _tableMem_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE, _tableMemMax * sizeof(unsigned int), NULL, &_err); clCheckError(_err, "[initHelper] Failed to alloc GPU _tableMem_obj."); _err = clEnqueueFillBuffer(command_queue_gpu, _tableMem_obj, &_zero, sizeof(const int), 0, _tableMemMax * sizeof(unsigned int), 0, NULL, NULL); clCheckError(_err, "[initHelper] Failed to init GPU _tableMem_obj."); _tableMemAllocator_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE, _numTables * sizeof(unsigned int), NULL, &_err); clCheckError(_err, "[initHelper] Failed to alloc GPU _tableMemAllocator_obj."); _err = clEnqueueFillBuffer(command_queue_gpu, _tableMemAllocator_obj, &_zero, sizeof(const int), 0, _numTables * sizeof(unsigned int), 0, NULL, NULL); clCheckError(_err, "[initHelper] Failed to init GPU _tableMemAllocator_obj."); _tablePointers_obj = clCreateBuffer(context_gpu, CL_MEM_READ_WRITE, _tablePointerMax * sizeof(unsigned int), NULL, &_err); clCheckError(_err, "[initHelper] Failed to alloc GPU _tablePointers_obj."); _err = clEnqueueFillBuffer(command_queue_gpu, _tablePointers_obj, &_tableNull, sizeof(const int), 0, _tablePointerMax * sizeof(unsigned int), 0, NULL, NULL); clCheckError(_err, "[initHelper] Failed to init GPU _tablePointers_obj."); clFinish(command_queue_gpu); std::cout << "Completed. \n"; #elif defined CPU_TB std::cout << "Initializing CPU tables and pointers ... " << std::endl; _tableMem = new unsigned int[_tableMemMax](); _tableMemAllocator = new unsigned int[_numTables](); _tablePointers = new unsigned int[_tablePointerMax]; _tablePointersLock = new omp_lock_t[_tablePointerMax]; std::cout << "Completed. " << std::endl; std::cout << "Initializing CPU tablePointers/Locks ... " << std::endl; for (unsigned long long i = 0; i < _tablePointerMax; i++) { _tablePointers[i] = TABLENULL; omp_init_lock(_tablePointersLock + i); } std::cout << "Completed. " << std::endl; std::cout << "Initializing CPU tableCountersLocks ... " << std::endl; _tableCountersLock = new omp_lock_t[_tableMemReservoirMax]; for (unsigned long long i = 0; i < _tableMemReservoirMax; i++) { omp_init_lock(_tableCountersLock + i); } std::cout << "Completed. " << std::endl; #endif /* Hashing counter. */ _sequentialIDCounter_kernel = 0; }
int main(int argc, char **argv) { int iter; int total_number_sequences = 0; int total_number_targets = 0; int total_alignments = 0; int sequence_index_start = 0; int sequence_index_end = 0; int target_index_start = 0; int target_index_end = 0; #ifdef PROFILING struct timeval startt, endt, startttotal, endttotal, starttiter, endtiter, timer_iter_total, timer_mm, timer_init, timer_H2D, timer_D2H, timer_kernel1, timer_kernel2, timer_plotAlignments, timer_kernel_builder, timer_total; struct timeval timer_iter_total_array[ITERATIONS]; struct timeval timer_mm_array[ITERATIONS]; struct timeval timer_init_array[ITERATIONS]; struct timeval timer_H2D_array[ITERATIONS]; struct timeval timer_D2H_array[ITERATIONS]; struct timeval timer_kernel1_array[ITERATIONS]; struct timeval timer_kernel2_array[ITERATIONS]; struct timeval timer_plotAlignments_array[ITERATIONS]; struct timeval timer_kernel_builder_array[ITERATIONS]; timer_total.tv_usec = 0; timer_total.tv_sec = 0; #endif #ifdef PROFILING gettimeofday(&startttotal, NULL); #endif for(iter=0; iter<ITERATIONS; iter++) { total_number_sequences = 0; total_number_targets = 0; #ifdef PROFILING timer_iter_total.tv_usec = 0; timer_iter_total.tv_sec = 0; timer_mm.tv_usec = 0; timer_mm.tv_sec = 0; timer_init.tv_usec = 0; timer_init.tv_sec = 0; timer_H2D.tv_usec = 0; timer_H2D.tv_sec = 0; timer_D2H.tv_usec = 0; timer_D2H.tv_sec = 0; timer_kernel1.tv_usec = 0; timer_kernel1.tv_sec = 0; timer_kernel2.tv_usec = 0; timer_kernel2.tv_sec = 0; timer_plotAlignments.tv_usec = 0; timer_plotAlignments.tv_sec = 0; timer_kernel_builder.tv_usec = 0; timer_kernel_builder.tv_sec = 0; #endif #ifdef PROFILING gettimeofday(&starttiter, NULL); #endif /** We do not have 11 arguments **/ if (argc != 10) { fprintf(stderr,"Error: use: ./paswas_opencl <sequenceFile> <targetFile> <superBlocksX> <superBlocksY> <sequence_index_start> <sequence_index_end> <target_index_start> <target_index_end> <performanceFileLoc>!\n"); exit(EXIT_FAILURE); } cl_platform_id platforms = NULL; cl_uint ret_num_platforms = 0; cl_uint ret_num_devices = 0; cl_device_id devices = NULL; cl_command_queue queue = NULL; cl_context context = NULL; cl_program program = NULL; cl_kernel kernel_calculateScore = NULL; cl_kernel kernel_traceback = NULL; cl_int error_check; error_check = clGetPlatformIDs(1, &platforms, &ret_num_platforms); if(error_check != CL_SUCCESS) { fprintf(stderr,"Could not find a valid OpenCL platform\n"); exit(EXIT_FAILURE); } sequence_index_start = atoi(argv[5]); if(sequence_index_start<0){ fprintf(stderr,"Please provide a valid sequence start index:%d\n",sequence_index_start); exit(EXIT_FAILURE); } fprintf(stderr,"sequence_index_start:%d\n",sequence_index_start); sequence_index_end = atoi(argv[6]); if(sequence_index_end<=sequence_index_start){ fprintf(stderr,"Please provide a valid sequence end index:%d\n",sequence_index_end); exit(EXIT_FAILURE); } fprintf(stderr,"sequence_index_end:%d\n",sequence_index_end); target_index_start = atoi(argv[7]); if(target_index_start<0){ fprintf(stderr,"Please provide a valid target start index:%d\n",target_index_start); exit(EXIT_FAILURE); } fprintf(stderr,"target_index_start:%d\n",target_index_start); target_index_end = atoi(argv[8]); if(target_index_end<=target_index_start){ fprintf(stderr,"Please provide a valid target end index:%d\n",target_index_end); exit(EXIT_FAILURE); } fprintf(stderr,"target_index_end:%d\n",target_index_end); #ifdef NVIDIA error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, &ret_num_devices); #endif #ifdef AMD_GPU error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, &ret_num_devices); #endif #ifdef AMD_CPU error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_CPU, 1, &devices, &ret_num_devices); #endif #ifdef INTEL error_check = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_CPU, 1, &devices, &ret_num_devices); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"No OpenCL devices found\n"); exit(EXIT_FAILURE); } cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platforms, 0 }; context = clCreateContext(properties, 1, &devices, &pfn_notify, NULL, &error_check); if(error_check != CL_SUCCESS) { fprintf(stderr,"Context could not be created\n"); exit(EXIT_FAILURE); } queue = clCreateCommandQueue(context, devices, 0, &error_check); if(error_check != CL_SUCCESS) { fprintf(stderr,"Command queue could not be created"); exit(EXIT_FAILURE); } dimensions superBlocks; superBlocks.x = atoi(argv[3]); superBlocks.y = atoi(argv[4]); if(!superBlocks.x || !superBlocks.y) { fprintf(stderr,"Please provide integer values for superblock_x or superblock_y\n"); } superBlocks.z = 0; // variables needed for the application: fprintf(stderr,"Superblocksx: %d\tSuperblocksy:%d\n",superBlocks.x,superBlocks.y); fprintf(stderr,"#SEQUENCES: %d\t#TARGET:%d\n",NUMBER_SEQUENCES,NUMBER_TARGETS); #ifdef PROFILING gettimeofday(&startt, NULL); #endif char *descSequences = (char *) malloc(sizeof(char) * superBlocks.x*NUMBER_SEQUENCES*MAX_LINE_LENGTH); char *descTargets = (char *) malloc(sizeof(char) * superBlocks.y* NUMBER_TARGETS*MAX_LINE_LENGTH); descSequences[0] = '\0'; descTargets[0] = '\0'; #ifdef PROFILING gettimeofday(&endt, NULL); timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif #ifdef PROFILING gettimeofday(&startt, NULL); #endif float h_scoringsMatrix[SCORINGS_MAT_SIZE*SCORINGS_MAT_SIZE] = {0}; fillScoringsMatrix(h_scoringsMatrix); #ifdef PROFILING gettimeofday(&endt, NULL); timer_init.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif int min_align; error_check = clGetDeviceInfo(devices, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(int), &min_align, NULL); if(error_check != CL_SUCCESS) { fprintf(stderr,"GetDeviceInfo failed"); exit(EXIT_FAILURE); } fprintf(stderr, "ALIGN = %d\n", min_align); #ifdef PROFILING gettimeofday(&startt, NULL); #endif #ifdef NO_ZERO_COPY char *h_sequences = 0, *h_targets = 0; cl_mem d_sequences, d_targets, d_matrix, d_globalMaxima, d_globalDirection; cl_mem d_indexIncrement, d_scoringsMatrix; GlobalMatrix *h_matrix = 0; GlobalMaxima *h_globalMaxima = 0; GlobalDirection *h_globalDirectionZeroCopy = 0; StartingPoints *h_startingPointsZeroCopy = 0; float *h_maxPossibleScoreZeroCopy = 0; cl_mem d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy; init(&h_sequences, &h_targets, &d_sequences, &d_targets, &h_matrix, &d_matrix, &h_globalMaxima, &d_globalMaxima, &d_globalDirection, &h_globalDirectionZeroCopy, &d_startingPointsZeroCopy, &h_startingPointsZeroCopy, &d_maxPossibleScoreZeroCopy, &h_maxPossibleScoreZeroCopy, &d_scoringsMatrix, &d_indexIncrement, superBlocks, context, queue, error_check); #endif /**Data Structure Initialization with NVIDIA_zero_copy **/ #ifdef NVIDIA_ZERO_COPY char *h_sequences = 0, *h_targets = 0; cl_mem d_sequences, d_targets, d_matrix, d_globalMaxima, d_globalDirection; cl_mem d_indexIncrement, d_scoringsMatrix; GlobalMatrix *h_matrix = 0; GlobalMaxima *h_globalMaxima = 0; StartingPoints *h_startingPointsZeroCopy = 0; float *h_maxPossibleScoreZeroCopy = 0; GlobalDirection *h_globalDirectionZeroCopy = 0; cl_mem pinned_startingPointsZeroCopy, pinned_maxPossibleScoreZeroCopy, pinned_globalDirectionZeroCopy; cl_mem d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy; // allocate memory on host & device: init_zc(&h_sequences, &h_targets, &d_sequences, &d_targets, &h_matrix, &d_matrix, &h_globalMaxima, &d_globalMaxima, &d_startingPointsZeroCopy, &h_startingPointsZeroCopy, &pinned_startingPointsZeroCopy, &d_maxPossibleScoreZeroCopy, &h_maxPossibleScoreZeroCopy, &pinned_maxPossibleScoreZeroCopy, &d_globalDirection, &h_globalDirectionZeroCopy, &pinned_globalDirectionZeroCopy, &d_scoringsMatrix, &d_indexIncrement, superBlocks, context, queue, error_check); #endif /**Data Structure Initialization with INTEL_CPU_zero_copy **/ #ifdef INTEL_ZERO_COPY char *h_sequences = 0, *h_targets = 0; cl_mem d_sequences, d_targets, d_matrix, d_globalMaxima, d_globalDirection; cl_mem d_indexIncrement, d_scoringsMatrix; GlobalMatrix *h_matrix = 0; GlobalMaxima *h_globalMaxima = 0; GlobalDirection *h_globalDirectionZeroCopy = 0; StartingPoints *h_startingPointsZeroCopy = 0; float *h_maxPossibleScoreZeroCopy = 0; cl_mem d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy; /**Zero copy variables base address **/ StartingPoints* startingPointsData = (StartingPoints*)memalign(min_align/8, sizeof(StartingPoints)); float* maxPossibleScoreData = (float*)memalign(min_align/8, sizeof(float) * NUMBER_SEQUENCES * superBlocks.x); GlobalDirection* globalDirectionData = (GlobalDirection*)memalign(min_align/8, sizeof(GlobalDirection)); // allocate memory on host & device: init_zc_CPU(&h_sequences, &h_targets, &d_sequences, &d_targets, &h_matrix, &d_matrix, &h_globalMaxima, &d_globalMaxima, &d_startingPointsZeroCopy, &startingPointsData, &d_maxPossibleScoreZeroCopy, &maxPossibleScoreData, &d_globalDirection, &globalDirectionData, &d_scoringsMatrix, &d_indexIncrement, superBlocks, context, queue, error_check); #endif #ifdef PROFILING gettimeofday(&endt, NULL); timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif /**Fill GlobalMatrix with 0's, feature is only available in OpenCL 1.2, however this is not needed for the shared memory case**/ #ifdef PROFILING gettimeofday(&startt, NULL); #endif #if (defined(INTEL) || defined(AMD)) && (defined(GLOBAL_MEM4)) float zero_pattern = 0.0; error_check = clEnqueueFillBuffer(queue, d_matrix, &zero_pattern, sizeof(float), 0, sizeof(GlobalMatrix), 0, NULL, NULL); if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to initialize GlobalMatrix to zero\n"); exit(EXIT_FAILURE); } clFinish(queue); #endif #ifdef PROFILING gettimeofday(&endt, NULL); timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif #ifdef PROFILING gettimeofday(&startt, NULL); #endif #ifdef GLOBAL_MEM4 GlobalSemaphores *h_semaphore = (GlobalSemaphores*) malloc(sizeof(GlobalSemaphores)); cl_mem d_semaphore = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(GlobalSemaphores), NULL, &error_check); if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to create semaphore on device\n"); exit(EXIT_FAILURE); } /** Initialize device buffer to all zeroes **/ initSemaphor(&d_semaphore, context, queue, error_check); #endif #ifdef PROFILING gettimeofday(&endt, NULL); timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif /* Write scoringsMatrix to device buffer*/ #ifdef PROFILING gettimeofday(&startt, NULL); #endif error_check = clEnqueueWriteBuffer(queue, d_scoringsMatrix, CL_TRUE, 0, sizeof(float) * 26 * 26, h_scoringsMatrix, 0, NULL, NULL); #ifdef PROFILING gettimeofday(&endt, NULL); timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to write scoringsMatrix to device buffer"); exit(EXIT_FAILURE); } gzFile targetFile, seqFile; kseq_t *target, *seq; targetFile = gzopen(argv[2], "r"); seqFile = gzopen(argv[1], "r"); if (!targetFile || !seqFile) { fprintf(stderr,"Error: could not open target/seq file!\n"); return 1; } #ifdef INTEL_ZERO_COPY #ifdef PROFILING gettimeofday(&startt, NULL); #endif h_maxPossibleScoreZeroCopy = (float *)clEnqueueMapBuffer(queue, d_maxPossibleScoreZeroCopy, CL_TRUE, CL_MAP_WRITE, 0, sizeof(float) * NUMBER_SEQUENCES * superBlocks.x, 0, NULL, NULL, &error_check); clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr, "clEnqueueMap CL_MAP_WRITE h_maxPossibleScoreZeroCopy => %d\n", error_check); exit(EXIT_FAILURE); } #endif int t=target_index_start; int s=sequence_index_start; int target_offset = 0; int sequence_offset = 0; int l; #ifdef PROFILING gettimeofday(&startt, NULL); #endif target = kseq_init(targetFile); int skip=0; //Skip sequences which we do not need while(skip<target_index_start) { kseq_read(target); skip++; } while ((l = kseq_read(target)) >= 0 && t<target_index_end) { char * current = h_targets+(target_offset*(unsigned int)Y); char * currentDesc = descTargets + (target_offset*MAX_LINE_LENGTH); current[0] = '\0'; currentDesc[0] = '\0'; char * description = (char*) malloc( sizeof(char) * (target->name.l + target->comment.l + target->qual.l + 3)); description[0] = '\0'; strcpy(description, target->name.s); if(target->comment.l) { strcat(description, " "); strcat(description, target->comment.s); } if(target->qual.l) { strcat(description, " "); strcat(description, target->qual.s); } if(strlen(description)>=MAX_LINE_LENGTH) { description[MAX_LINE_LENGTH-1] = '\0'; } strcpy(currentDesc,description); free(description); if(target->seq.l > Y) { fprintf(stderr, "Error: target read too long!\n"); fprintf(stderr, "id: %s\n", currentDesc); fprintf(stderr, "Y:%d\n", Y); fprintf(stderr, "target_file:%s\n",argv[2]); return 1; } strncpy(current, target->seq.s, target->seq.l); int i=0; for (; i < target->seq.l; i++) { current[i] = toupper(current[i]); if (current[i] - 'A' < 0 || current[i] - 'A' > 25) { fprintf(stderr, "Error: wrong character in target file: '%c', desc: %s\n", current[i], current); return 1; } } for(; i < Y; i++) { current[i] = FILL_CHARACTER; } t++; target_offset++; } fprintf(stderr, "Read number of targets: %d from %s\n", target_offset,argv[2]); total_number_targets = target_offset; for (; target_offset < superBlocks.y * NUMBER_TARGETS; target_offset++) { for (int c=0; c < Y; c++) *(h_targets+target_offset*Y+c) = FILL_CHARACTER; } seq = kseq_init(seqFile); skip=0; //Skip sequences which we do not need while(skip<sequence_index_start) { kseq_read(seq); skip++; } while ((l = kseq_read(seq)) >= 0 && s<sequence_index_end) { h_maxPossibleScoreZeroCopy[sequence_offset] = 0; char * current = h_sequences +(sequence_offset*(unsigned int)X); char * currentDesc = descSequences + (sequence_offset*MAX_LINE_LENGTH); current[0] = '\0'; currentDesc[0] = '\0'; char * description = (char*) malloc( sizeof(char) * (seq->name.l + seq->comment.l + seq->qual.l + 3)); description[0] = '\0'; strcpy(description, seq->name.s); if(seq->comment.l) { strcat(description, " "); strcat(description, seq->comment.s); } if(seq->qual.l) { strcat(description, " "); strcat(description, seq->qual.s); } if(strlen(description)>=MAX_LINE_LENGTH) { description[MAX_LINE_LENGTH-1] = '\0'; } strcpy(currentDesc,description); free(description); if(seq->seq.l > X) { fprintf(stderr, "Error: sequence read too long!\n"); fprintf(stderr, "id: %s\n", currentDesc); fprintf(stderr, "X:%d\n", X); fprintf(stderr, "sequence_file:%s\n",argv[1]); return 1; } strncpy(current, seq->seq.s, seq->seq.l); int i=0; for (; i < seq->seq.l; i++) { current[i] = toupper(current[i]); h_maxPossibleScoreZeroCopy[sequence_offset] += HIGHEST_SCORE; if (current[i] - 'A' < 0 || current[i] - 'A' > 25) { fprintf(stderr, "Error: wrong character in seq file: '%c', desc: %s\n", current[i], current); return 1; } } for(; i < X; i++) { current[i] = FILL_CHARACTER; } h_maxPossibleScoreZeroCopy[sequence_offset] *= LOWER_LIMIT_MAX_SCORE; s++; sequence_offset++; } fprintf(stderr, "Read number of sequences: %d from %s\n", sequence_offset, argv[1]); total_number_sequences = sequence_offset; int count = 0; for (; sequence_offset < superBlocks.x * NUMBER_SEQUENCES; sequence_offset++) { count++; h_maxPossibleScoreZeroCopy[sequence_offset] = 0; for (int c=0; c < X; c++) *(h_sequences+sequence_offset*X+c) = FILL_CHARACTER; } kseq_destroy(target); gzclose(targetFile); kseq_destroy(seq); gzclose(seqFile); #ifdef PROFILING gettimeofday(&endt, NULL); timer_init.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif /* Write maxPossibleScoreZeroCopy to device buffer*/ #ifdef PROFILING gettimeofday(&startt, NULL); #endif #if defined(NO_ZERO_COPY) || defined(NVIDIA_ZERO_COPY) error_check = clEnqueueWriteBuffer(queue, d_maxPossibleScoreZeroCopy, CL_TRUE, 0, sizeof(float) * NUMBER_SEQUENCES * superBlocks.x, h_maxPossibleScoreZeroCopy, 0, NULL, NULL); #endif #ifdef INTEL_ZERO_COPY error_check = clEnqueueUnmapMemObject(queue, d_maxPossibleScoreZeroCopy, h_maxPossibleScoreZeroCopy, 0, NULL, NULL); #endif clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to write maxPossibleScoreZeroCopy to buffer"); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&startt, NULL); #endif FILE *fp; size_t kernel_size; char *source; fp = fopen("kernel/smithwaterman_kern.cl", "rb"); if(fp) { /* Set the file pointer to the end of the file */ fseek(fp, 0, SEEK_END); kernel_size = ftell(fp); /* Set the file pointer back to the beginning of the file */ fseek(fp, 0, SEEK_SET); source = (char*)malloc(kernel_size+1); fread(source, 1, kernel_size, fp); source[kernel_size] = '\0'; fclose(fp); } else { fprintf(stderr,"Could not read from: kernel/smithwaterman_kern.cl\n"); exit(EXIT_FAILURE); } program = clCreateProgramWithSource(context, 1, (const char **)&source, (const size_t *)&kernel_size, &error_check); if (error_check != CL_SUCCESS) { fprintf(stderr,"Could not create a program object with the provided source code"); exit(EXIT_FAILURE); } if(source){ free(source); } /**TODO Determine size of directives dynamically**/ int size_directives = 2048; char *directives = (char *) malloc(sizeof(char) * size_directives); #ifdef GLOBAL_MEM4 #ifdef NVIDIA sprintf(directives, "-DNVIDIA -DGLOBAL_MEM4 -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d -DWORKLOAD_X=%d -DWORKLOAD_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y, WORKLOAD_X, WORKLOAD_Y); #else sprintf(directives, "-DGLOBAL_MEM4 -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d -DWORKLOAD_X=%d -DWORKLOAD_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y, WORKLOAD_X, WORKLOAD_Y); #endif #endif #ifdef SHARED_MEM #ifdef NVIDIA sprintf(directives, "-DNVIDIA -DSHARED_MEM -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y); #else sprintf(directives, "-DSHARED_MEM -DNUMBER_SEQUENCES=%d -DNUMBER_TARGETS=%d -DX=%d -DY=%d -DMINIMUM_SCORE=%f -DSHARED_X=%d -DSHARED_Y=%d",NUMBER_SEQUENCES,NUMBER_TARGETS,X,Y,MINIMUM_SCORE,SHARED_X,SHARED_Y); #endif #endif error_check = clBuildProgram(program, 1, &devices,directives, NULL, NULL); free(directives); if (error_check != CL_SUCCESS) { size_t length; clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, 0, NULL, &length); char* buffer = (char*)malloc(length+1); clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, length, buffer, NULL); buffer[length] ='\0'; fprintf(stderr,"Error: Failed to build program executable!\n"); printf("%s\n", buffer); free(buffer); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&endt, NULL); timer_kernel_builder.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif total_alignments = 0; for (int i=0; i < superBlocks.y; i++) { #ifdef PROFILING gettimeofday(&startt, NULL); #endif error_check = clEnqueueWriteBuffer(queue, d_targets, CL_TRUE, 0, sizeof(char) * Y*NUMBER_TARGETS, h_targets+(i*NUMBER_TARGETS*Y), 0, NULL, NULL); clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to write target data to buffer"); exit(EXIT_FAILURE); } for (int j=0;j<superBlocks.x;j++) { // copy sequences to the device: #ifdef PROFILING gettimeofday(&startt, NULL); #endif error_check = clEnqueueWriteBuffer(queue, d_sequences, CL_TRUE, 0, sizeof(char)*X*NUMBER_SEQUENCES, h_sequences+(j*NUMBER_SEQUENCES*X), 0, NULL, NULL); clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to write sequence data to buffer"); exit(EXIT_FAILURE); } // make sure database-type index is reset: #ifdef PROFILING gettimeofday(&startt, NULL); #endif initZeroCopy(&d_indexIncrement, context, queue, error_check); #ifdef PROFILING gettimeofday(&endt, NULL); timer_H2D.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif // fill the scorings matrix: kernel_calculateScore = clCreateKernel(program, "calculateScore", &error_check); if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to create calculate score kernel"); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&startt, NULL); #endif calculateScoreHost(d_matrix, d_sequences, d_targets, d_globalMaxima, d_globalDirection, d_scoringsMatrix, kernel_calculateScore, queue, error_check); #ifdef PROFILING gettimeofday(&endt, NULL); timer_kernel1.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif // create tracebacks and copy information through zero copy to the host: kernel_traceback = clCreateKernel(program, "traceback", &error_check); if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to create calculate traceback kernel"); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&startt, NULL); #endif #ifdef GLOBAL_MEM4 tracebackHost(d_matrix, d_globalMaxima, d_globalDirection, d_indexIncrement, d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy, j*NUMBER_SEQUENCES, kernel_traceback, queue, error_check, d_semaphore); #else tracebackHost(d_matrix, d_globalMaxima, d_globalDirection, d_indexIncrement, d_startingPointsZeroCopy, d_maxPossibleScoreZeroCopy, j*NUMBER_SEQUENCES, kernel_traceback, queue, error_check); #endif #ifdef PROFILING gettimeofday(&endt, NULL); timer_kernel2.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif // get number of alignments: unsigned int index[1]; #ifdef PROFILING gettimeofday(&startt, NULL); #endif error_check = clEnqueueReadBuffer(queue, d_indexIncrement, CL_TRUE, 0, sizeof(unsigned int), index, 0, NULL, NULL); clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_D2H.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to read from d_indexIncrement"); exit(EXIT_FAILURE); } fprintf(stderr, "Number of alignments: %d @ %d in iteration: %d\n", *index, j, iter); // plot the alignments: total_alignments+=index[0]; #ifdef PROFILING gettimeofday(&startt, NULL); #endif /**Reading from zeroCopy Buffer**/ #if defined(NO_ZERO_COPY) || defined(NVIDIA_ZERO_COPY) error_check = clEnqueueReadBuffer(queue, d_globalDirection, CL_TRUE, 0, sizeof(GlobalDirection), h_globalDirectionZeroCopy, 0, NULL, NULL); #endif #ifdef INTEL_ZERO_COPY h_globalDirectionZeroCopy = (GlobalDirection *)clEnqueueMapBuffer(queue, d_globalDirection, CL_TRUE, CL_MAP_READ, 0, sizeof(GlobalDirection), 0, NULL, NULL, &error_check); #endif clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_D2H.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to read from d_globalDirectionZeroCopy"); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&startt, NULL); #endif /**Reading from zeroCopy Buffer**/ #if defined(NO_ZERO_COPY) || defined(NVIDIA_ZERO_COPY) error_check = clEnqueueReadBuffer(queue, d_startingPointsZeroCopy, CL_TRUE, 0, sizeof(StartingPoints), h_startingPointsZeroCopy, 0, NULL, NULL); #endif #ifdef INTEL_ZERO_COPY h_startingPointsZeroCopy = (StartingPoints *)clEnqueueMapBuffer(queue, d_startingPointsZeroCopy, CL_TRUE, CL_MAP_READ, 0, sizeof(StartingPoints), 0, NULL, NULL, &error_check); #endif clFinish(queue); #ifdef PROFILING gettimeofday(&endt, NULL); timer_D2H.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif if(error_check != CL_SUCCESS) { fprintf(stderr,"Failed to read from d_startingPointsZeroCopy"); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&startt, NULL); #endif plotAlignments(h_sequences, h_targets, h_globalDirectionZeroCopy, *index, h_startingPointsZeroCopy, j*NUMBER_SEQUENCES, i*NUMBER_TARGETS, descSequences, descTargets); #ifdef PROFILING gettimeofday(&endt, NULL); timer_plotAlignments.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif } } #ifdef PROFILING gettimeofday(&startt, NULL); #endif /** Unique to NVIDIA Zero-Copy **/ #ifdef NVIDIA_ZERO_COPY clEnqueueUnmapMemObject(queue, pinned_maxPossibleScoreZeroCopy, (void*)h_maxPossibleScoreZeroCopy, 0, NULL, NULL); clEnqueueUnmapMemObject(queue, pinned_startingPointsZeroCopy, (void*)h_startingPointsZeroCopy, 0, NULL, NULL); clEnqueueUnmapMemObject(queue, pinned_globalDirectionZeroCopy, (void*)h_globalDirectionZeroCopy, 0, NULL, NULL); clFinish(queue); error_check = 0; error_check = clReleaseMemObject(d_sequences); error_check |= clReleaseMemObject(d_targets); error_check |= clReleaseMemObject(d_matrix); error_check |= clReleaseMemObject(d_globalMaxima); error_check |= clReleaseMemObject(d_globalDirection); error_check |= clReleaseMemObject(pinned_startingPointsZeroCopy); error_check |= clReleaseMemObject(d_startingPointsZeroCopy); error_check |= clReleaseMemObject(pinned_maxPossibleScoreZeroCopy); error_check |= clReleaseMemObject(d_maxPossibleScoreZeroCopy); error_check |= clReleaseMemObject(pinned_globalDirectionZeroCopy); error_check |= clReleaseMemObject(d_scoringsMatrix); error_check |= clReleaseMemObject(d_indexIncrement); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Releasing memory objects has failed in NVIDIA zero copy\n"); exit(EXIT_FAILURE); } free(h_sequences); free(h_targets); free(h_matrix); free(h_globalMaxima); #endif /** Unique to NO Zero-Copy **/ #ifdef NO_ZERO_COPY error_check = 0; error_check |= clReleaseMemObject(d_sequences); error_check |= clReleaseMemObject(d_targets); error_check |= clReleaseMemObject(d_matrix); error_check |= clReleaseMemObject(d_globalMaxima); error_check |= clReleaseMemObject(d_globalDirection); error_check |= clReleaseMemObject(d_startingPointsZeroCopy); error_check |= clReleaseMemObject(d_maxPossibleScoreZeroCopy); error_check |= clReleaseMemObject(d_scoringsMatrix); error_check |= clReleaseMemObject(d_indexIncrement); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Releasing memory objects has failed in no zero copy\n"); exit(EXIT_FAILURE); } free(h_sequences); free(h_targets); free(h_matrix); free(h_globalMaxima); free(h_globalDirectionZeroCopy); free(h_startingPointsZeroCopy); free(h_maxPossibleScoreZeroCopy); #endif /** Unique to Intel Zero-Copy **/ #ifdef INTEL_ZERO_COPY error_check = 0; error_check |= clReleaseMemObject(d_sequences); error_check |= clReleaseMemObject(d_targets); error_check |= clReleaseMemObject(d_matrix); error_check |= clReleaseMemObject(d_globalMaxima); error_check |= clReleaseMemObject(d_globalDirection); error_check |= clReleaseMemObject(d_startingPointsZeroCopy); error_check |= clReleaseMemObject(d_maxPossibleScoreZeroCopy); error_check |= clReleaseMemObject(d_scoringsMatrix); error_check |= clReleaseMemObject(d_indexIncrement); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Releasing memory objects has failed in INTEL zero copy\n"); exit(EXIT_FAILURE); } free(h_sequences); free(h_targets); free(h_matrix); free(h_globalMaxima); free(startingPointsData); free(maxPossibleScoreData); free(globalDirectionData); #endif #ifdef GLOBAL_MEM4 free(h_semaphore); clReleaseMemObject(d_semaphore); #endif free(descTargets); free(descSequences); error_check = 0; error_check = clReleaseKernel(kernel_calculateScore); error_check |= clReleaseKernel(kernel_traceback); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Could not release kernels\n"); exit(EXIT_FAILURE); } error_check = clReleaseProgram(program); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Could not release program\n"); exit(EXIT_FAILURE); } error_check = clReleaseCommandQueue(queue); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Could not release CommandQueue\n"); exit(EXIT_FAILURE); } error_check = clReleaseContext(context); if(error_check!= CL_SUCCESS) { fprintf(stderr, "Could not release context\n"); exit(EXIT_FAILURE); } #ifdef PROFILING gettimeofday(&endt, NULL); timer_mm.tv_usec += (endt.tv_sec*1000000+endt.tv_usec) - (startt.tv_sec*1000000+startt.tv_usec); #endif #ifdef PROFILING gettimeofday(&endtiter, NULL); timer_iter_total.tv_usec += (endtiter.tv_sec*1000000+endtiter.tv_usec) - (starttiter.tv_sec*1000000+starttiter.tv_usec); #endif #ifdef PROFILING timer_iter_total_array[iter] = timer_iter_total; timer_mm_array[iter] = timer_mm; timer_init_array[iter] = timer_init; timer_H2D_array[iter] = timer_H2D; timer_D2H_array[iter] = timer_D2H; timer_kernel1_array[iter] = timer_kernel1; timer_kernel2_array[iter] = timer_kernel2; timer_plotAlignments_array[iter] = timer_plotAlignments; timer_kernel_builder_array[iter] = timer_kernel_builder; #endif } #ifdef PROFILING gettimeofday(&endttotal, NULL); timer_total.tv_usec += (endttotal.tv_sec*1000000+endttotal.tv_usec) - (startttotal.tv_sec*1000000+startttotal.tv_usec); #endif #ifdef PROFILING if(exists(argv[9])) { FILE *timeFile; timeFile = fopen(argv[9],"a+"); if(timeFile) { for(iter=0; iter<ITERATIONS; iter++) { //fprintf(timeFile, "%d\t", iter); fprintf(timeFile, "%d\t",sequence_index_start); fprintf(timeFile, "%d\t",sequence_index_end); fprintf(timeFile, "%d\t",total_number_sequences); fprintf(timeFile, "%d\t", X); fprintf(timeFile, "%d\t", NUMBER_SEQUENCES); fprintf(timeFile, "%s\t", argv[3]); fprintf(timeFile, "%d\t",target_index_start); fprintf(timeFile, "%d\t",target_index_end); fprintf(timeFile, "%d\t",total_number_targets); fprintf(timeFile, "%d\t", Y); fprintf(timeFile, "%d\t", NUMBER_TARGETS); fprintf(timeFile, "%s\t", argv[4]); fprintf(timeFile, "%ld.%06ld\t",timer_mm_array[iter].tv_usec/1000000, timer_mm_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_init_array[iter].tv_usec/1000000, timer_init_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_H2D_array[iter].tv_usec/1000000, timer_H2D_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_D2H_array[iter].tv_usec/1000000, timer_D2H_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_kernel1_array[iter].tv_usec/1000000, timer_kernel1_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_kernel2_array[iter].tv_usec/1000000, timer_kernel2_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_plotAlignments_array[iter].tv_usec/1000000, timer_plotAlignments_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_iter_total_array[iter].tv_usec/1000000, timer_iter_total_array[iter].tv_usec%1000000); fprintf(timeFile, "%d\n",total_alignments); } fprintf(timeFile, "#Total execution time: %ld.%06ld\n",timer_total.tv_usec/1000000, timer_total.tv_usec%1000000); fprintf(timeFile, "\n"); fclose(timeFile); } else{ fprintf(stderr,"Cannot append to file: %s\n", argv[9]); exit(EXIT_FAILURE); } } else { FILE *timeFile; timeFile = fopen(argv[9],"a+"); if(timeFile) { fprintf(timeFile, "#Start_s\tEnd_s\tTotal_s\tX\tDevice_s\tSuperblock_s\tStart_t\tEnd_t\tTotal_t\tY\tDevice_t\tSuperblock_t\tMem_Management\tInit\tH2D\tD2H\tkernel1\tkernel2\tPlotAlignments\titer_total\tAlignments\n"); for(iter=0; iter<ITERATIONS; iter++) { fprintf(timeFile, "%d\t",sequence_index_start); fprintf(timeFile, "%d\t",sequence_index_end); fprintf(timeFile, "%d\t",total_number_sequences); fprintf(timeFile, "%d\t", X); fprintf(timeFile, "%d\t", NUMBER_SEQUENCES); fprintf(timeFile, "%s\t", argv[3]); fprintf(timeFile, "%d\t",target_index_start); fprintf(timeFile, "%d\t",target_index_end); fprintf(timeFile, "%d\t",total_number_targets); fprintf(timeFile, "%d\t", Y); fprintf(timeFile, "%d\t", NUMBER_TARGETS); fprintf(timeFile, "%s\t", argv[4]); fprintf(timeFile, "%ld.%06ld\t",timer_mm_array[iter].tv_usec/1000000, timer_mm_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_init_array[iter].tv_usec/1000000, timer_init_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_H2D_array[iter].tv_usec/1000000, timer_H2D_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_D2H_array[iter].tv_usec/1000000, timer_D2H_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_kernel1_array[iter].tv_usec/1000000, timer_kernel1_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_kernel2_array[iter].tv_usec/1000000, timer_kernel2_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_plotAlignments_array[iter].tv_usec/1000000, timer_plotAlignments_array[iter].tv_usec%1000000); fprintf(timeFile, "%ld.%06ld\t",timer_iter_total_array[iter].tv_usec/1000000, timer_iter_total_array[iter].tv_usec%1000000); fprintf(timeFile, "%d\n",total_alignments); } fprintf(timeFile, "#Total execution time: %ld.%06ld\n",timer_total.tv_usec/1000000, timer_total.tv_usec%1000000); fprintf(timeFile, "\n"); fclose(timeFile); } else{ fprintf(stderr,"Cannot create file: %s\n", argv[9]); exit(EXIT_FAILURE); } } #endif return(0); }
rcl_status cl_create_buffer(struct client_state* state, uint64_t size, buffer_type type, buffer_t* buffer) { int32_t index; uint8_t zero = 0; cl_int error; cl_mem_flags flags = 0; rcl_status status = RCL_OK; if (size == 0) return RCL_INVALID_VALUE; struct buffer_state* buffer_state = malloc(sizeof(struct buffer_state)); if (!buffer_state) return RCL_HOST_RESOURCE; buffer_state->original = NULL; buffer_state->type = type; buffer_state->size = size; log_print(log_notice, "Creating buffer, size: %" PRIu64", type: %s", size, type == BUFFER_READ_ONLY ? "read only" : (type == BUFFER_WRITE_ONLY ? "write only" : "read/write")); switch (type) { case BUFFER_READ_ONLY: flags |= CL_MEM_READ_ONLY; break; case BUFFER_WRITE_ONLY: flags |= CL_MEM_WRITE_ONLY; break; case BUFFER_READ_WRITE: flags |= CL_MEM_READ_WRITE; break; } buffer_state->id = clCreateBuffer(state->context, flags, size, NULL, &error); if (error != CL_SUCCESS) { log_print(log_error, "Error creating buffer: %s", clerror_name(error)); status = opencl_error(error); goto out_state; } if (type == BUFFER_WRITE_ONLY || type == BUFFER_READ_WRITE) { if (cl_feature_check(CL_FEATURE_FILL_BUFFER)) { error = clEnqueueFillBuffer(state->command_queue, buffer_state->id, &zero, sizeof(zero), 0, size, 0, NULL, NULL); } else error = cl_utils_clear_buffer(state, buffer_state->id, size); if (error != CL_SUCCESS) { status = opencl_error(error); goto out_state; } buffer_state->original = malloc(size); if (!buffer_state->original) { status = RCL_HOST_RESOURCE; goto out_mem; } memset(buffer_state->original, 0, size); } index = vector_add(&state->buffers, &buffer_state); if (index < 0) { status = RCL_HOST_RESOURCE; goto out_mem; } *buffer = index + 1; return RCL_OK; out_mem: clReleaseMemObject(buffer_state->id); out_state: if (buffer_state) free(buffer_state->original); free(buffer_state); return status; }
clsparseStatus reduce_by_key( int keys_first, int keys_last, int values_first, cl_mem keys_input, cl_mem values_input, cl_mem keys_output, cl_mem values_output, int *count, clsparseControl control ) { cl_int l_Error; /********************************************************************************** * Compile Options *********************************************************************************/ const int kernel0_WgSize = WAVESIZE*KERNEL02WAVES; const int kernel1_WgSize = WAVESIZE*KERNEL1WAVES; const int kernel2_WgSize = WAVESIZE*KERNEL02WAVES; //const std::string params = std::string() + // " -DKERNEL0WORKGROUPSIZE=" + std::to_string(kernel0_WgSize) // + " -DKERNEL1WORKGROUPSIZE=" + std::to_string(kernel1_WgSize) // + " -DKERNEL2WORKGROUPSIZE=" + std::to_string(kernel2_WgSize); const std::string params; cl::Context context = control->getContext(); std::vector<cl::Device> dev = context.getInfo<CL_CONTEXT_DEVICES>(); int computeUnits = dev[0].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >( ); int wgPerComputeUnit = dev[0].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >( ); int resultCnt = computeUnits * wgPerComputeUnit; cl_uint numElements = keys_last - keys_first + 1; size_t sizeInputBuff = numElements; int modWgSize = (sizeInputBuff & (kernel0_WgSize-1)); if( modWgSize ) { sizeInputBuff &= ~modWgSize; sizeInputBuff += kernel0_WgSize; } cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize ); size_t sizeScanBuff = numWorkGroupsK0; modWgSize = (sizeScanBuff & (kernel0_WgSize-1)); if( modWgSize ) { sizeScanBuff &= ~modWgSize; sizeScanBuff += kernel0_WgSize; } cl_mem tempArrayVec = clCreateBuffer(context(),CL_MEM_READ_WRITE, (numElements)*sizeof(int), NULL, NULL ); /********************************************************************************** * Kernel 0 *********************************************************************************/ cl::Kernel kernel0 = KernelCache::get(control->queue,"reduce_by_key", "OffsetCalculation", params); KernelWrap kWrapper0(kernel0); kWrapper0 << keys_input << tempArrayVec << numElements; cl::NDRange local0(kernel0_WgSize); cl::NDRange global0(sizeInputBuff); cl_int status = kWrapper0.run(control, global0, local0); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } int init = 0; scan(0, numElements - 1, tempArrayVec, tempArrayVec, 0, 0, control ); int pattern = 0; cl_mem keySumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL ); cl_mem preSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL ); cl_mem postSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE,(sizeScanBuff)*sizeof(int), NULL, NULL ); clEnqueueFillBuffer(control->queue(), keySumArray, &pattern, sizeof(int), 0, (sizeScanBuff)*sizeof(int), 0, NULL, NULL); clEnqueueFillBuffer(control->queue(), preSumArray, &pattern, sizeof(int), 0, (sizeScanBuff)*sizeof(int), 0, NULL, NULL); clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0, (sizeScanBuff)*sizeof(int), 0, NULL, NULL); /********************************************************************************** * Kernel 1 *********************************************************************************/ cl::Kernel kernel1 = KernelCache::get(control->queue,"reduce_by_key", "perBlockScanByKey", params); KernelWrap kWrapper1(kernel1); kWrapper1 << tempArrayVec << values_input << numElements << keySumArray << preSumArray; cl::NDRange local1(kernel0_WgSize); cl::NDRange global1(sizeInputBuff); status = kWrapper1.run(control, global1, local1); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } /********************************************************************************** * Kernel 2 *********************************************************************************/ cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize ); cl::Kernel kernel2 = KernelCache::get(control->queue,"reduce_by_key", "intraBlockInclusiveScanByKey", params); KernelWrap kWrapper2(kernel2); kWrapper2 << keySumArray << preSumArray << postSumArray << numWorkGroupsK0 << workPerThread; cl::NDRange local2(kernel1_WgSize); cl::NDRange global2(kernel1_WgSize); status = kWrapper2.run(control, global2, local2); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } /********************************************************************************** * Kernel 3 *********************************************************************************/ cl::Kernel kernel3 = KernelCache::get(control->queue,"reduce_by_key", "keyValueMapping", params); KernelWrap kWrapper3(kernel3); kWrapper3 << keys_input << keys_output << values_input << values_output << tempArrayVec << keySumArray << postSumArray << numElements; cl::NDRange local3(kernel0_WgSize); cl::NDRange global3(sizeInputBuff); status = kWrapper3.run(control, global3, local3); if (status != CL_SUCCESS) { return clsparseInvalidKernelExecution; } int *h_result = (int *) malloc (sizeof(int)); clEnqueueReadBuffer(control->queue(), tempArrayVec, 1, (numElements-1)*sizeof(int), sizeof(int), h_result, 0, 0, 0); *count = *(h_result); //printf("h_result = %d\n", *count ); //release buffers clReleaseMemObject(tempArrayVec); clReleaseMemObject(preSumArray); clReleaseMemObject(postSumArray); clReleaseMemObject(keySumArray); return clsparseSuccess; } //end of reduce_by_key
int main(int argc, char *argv[]) { // selected platform and device number cl_uint pn = 0, dn = 0; // OpenCL error cl_int error; // generic iterator cl_uint i; // major/minor version of the platform OpenCL version cl_uint ocl_major, ocl_minor; // set platform/device num from command line if (argc > 1) pn = atoi(argv[1]); if (argc > 2) dn = atoi(argv[2]); error = clGetPlatformIDs(0, NULL, &np); CHECK_ERROR("getting amount of platform IDs"); printf("%u platforms found\n", np); if (pn >= np) { fprintf(stderr, "there is no platform #%u\n" , pn); exit(1); } // only allocate for IDs up to the intended one platform = calloc(pn+1,sizeof(*platform)); // if allocation failed, next call will bomb. rely on this error = clGetPlatformIDs(pn+1, platform, NULL); CHECK_ERROR("getting platform IDs"); // choose platform p = platform[pn]; error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform name"); printf("using platform %u: %s\n", pn, strbuf); error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform version"); // we need 1.2 at least i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor); if (i != 2) { fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n", __func__, __LINE__); exit(1); } if (ocl_major == 1 && ocl_minor < 2) { fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n", __func__, __LINE__, strbuf); exit(1); } error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); CHECK_ERROR("getting amount of device IDs"); printf("%u devices found\n", nd); if (dn >= nd) { fprintf(stderr, "there is no device #%u\n", dn); exit(1); } // only allocate for IDs up to the intended one device = calloc(dn+1,sizeof(*device)); // if allocation failed, next call will bomb. rely on this error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL); CHECK_ERROR("getting device IDs"); // choose device d = device[dn]; error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting device name"); printf("using device %u: %s\n", dn, strbuf); error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmem), &gmem, NULL); CHECK_ERROR("getting device global memory size"); error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_max), &alloc_max, NULL); CHECK_ERROR("getting device max memory allocation size"); // create context ctx_prop[1] = (cl_context_properties)p; ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error); CHECK_ERROR("creating context"); // create queue q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error); CHECK_ERROR("creating queue"); // create program pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error); CHECK_ERROR("creating program"); // build program error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL); CHECK_ERROR("building program"); // get kernel k = clCreateKernel(pg, "add", &error); CHECK_ERROR("creating kernel"); error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(wgm), &wgm, NULL); CHECK_ERROR("getting preferred workgroup size multiple"); // number of elements on which kernel will be launched. it's ok if we don't // cover every byte of the buffers nels = alloc_max/sizeof(cl_float); gws = ROUND_MUL(nels, wgm); printf("will use %zu workitems grouped by %zu to process %u elements\n", gws, wgm, nels); // we will try and allocate at least one buffer more than needed to fill // the device memory, and no less than 3 anyway nbuf = gmem/alloc_max + 1; if (nbuf < 3) nbuf = 3; #define MB (1024*1024.0) printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n", nbuf, alloc_max/MB, gmem/MB); hostbuf = calloc(nbuf, sizeof(cl_mem)); if (!hostbuf) { fprintf(stderr, "could not prepare support for %u buffers\n", nbuf); exit(1); } // allocate ‘host’ buffers for (i = 0; i < nbuf; ++i) { hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max, NULL, &error); CHECK_ERROR("allocating host buffer"); printf("host buffer %u allocated\n", i); error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i, CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, 0, NULL, NULL); CHECK_ERROR("migrating buffer to host"); printf("buffer %u migrated to host\n", i); } // allocate ‘device’ buffers for (i = 0; i < 2; ++i) { devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max, NULL, &error); CHECK_ERROR("allocating devbuffer"); printf("dev buffer %u allocated\n", i); if (i == 0) { float patt = 0; error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt), 0, nels*sizeof(patt), 0, NULL, &mem_evt); CHECK_ERROR("enqueueing memset"); } } error = clWaitForEvents(1, &mem_evt); CHECK_ERROR("waiting for buffer fill"); clReleaseEvent(mem_evt); mem_evt = NULL; // use the buffers for (i = 0; i < nbuf; ++i) { printf("testing buffer %u\n", i); // for each buffer, we do a setup on CPU and then use it as second // argument for the kernel hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, alloc_max, 0, NULL, NULL, &error); CHECK_ERROR("mapping buffer"); for (e = 0; e < nels; ++e) hbuf[e] = i; error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer"); hbuf = NULL; // copy ‘host’ to ‘device’ buffer clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max, 0, NULL, NULL); // make sure all pending actions are completed error = clFinish(q); CHECK_ERROR("settling down"); clSetKernelArg(k, 0, sizeof(cl_mem), devbuf); clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1); clSetKernelArg(k, 2, sizeof(nels), &nels); error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm, 0, NULL, &krn_evt); CHECK_ERROR("enqueueing kernel"); error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0], 0, 0, alloc_max, 1, &krn_evt, &mem_evt); CHECK_ERROR("copying data to host"); expected = i*(i+1)/2.0f; hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ, 0, alloc_max, 1, &mem_evt, NULL, &error); CHECK_ERROR("mapping buffer 0"); for (e = 0; e < nels; ++e) if (hbuf[e] != expected) { fprintf(stderr, "mismatch @ %u: %g instead of %g\n", e, hbuf[e], expected); exit(1); } error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer 0"); hbuf = NULL; clReleaseEvent(krn_evt); clReleaseEvent(mem_evt); krn_evt = mem_evt = NULL; } for (i = 1; i <= 2; ++i) { clReleaseMemObject(devbuf[2 - i]); printf("dev buffer %u freed\n", nbuf - i); } for (i = 1; i <= nbuf; ++i) { clReleaseMemObject(hostbuf[nbuf - i]); printf("host buffer %u freed\n", nbuf - i); } return 0; }