void simulation_step() { cl_event ev_kernel; cl_int err; size_t glb_sz[1] = { N }; err = clEnqueueNDRangeKernel(ocl.queue, ocl.kernel_step, 1, NULL, glb_sz, NULL, 0, NULL, &ev_kernel); CLU_ERRCHECK(err, "Failed to enqueue kernel"); // read result... err = clEnqueueReadBuffer(ocl.queue, ocl.mem_bodies, CL_TRUE, 0, N * sizeof(body), &B, 0, NULL, NULL); CLU_ERRCHECK(err, "Failed to read result"); // and evaluate time totalTime += cluGetDurationMS(ev_kernel); steps++; }
void cleanup_ocl() { cl_int err; // finalization err = clFinish(ocl.queue); err |= clReleaseKernel(ocl.kernel_step); err |= clReleaseProgram(ocl.prog); err |= clReleaseMemObject(ocl.mem_bodies); err |= clReleaseCommandQueue(ocl.queue); err |= clReleaseContext(ocl.ctx); CLU_ERRCHECK(err, "Failed during ocl cleanup"); }
double getDurationMS(cl_event event) { cl_ulong start = 0; cl_ulong end = 0; cl_int ret; ret = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), (void *)&start, NULL); ret |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), (void *)&end, NULL); CLU_ERRCHECK(ret, "Unable to read profiling information!"); // the values are measured in nano seconds! return (double)(end - start) / 1000000.0; }
// ----------- opencl ------------- void init_ocl() { cl_int err; ocl.id = cluInitDevice(CL_DEVICE, &ocl.ctx, &ocl.queue); printf("OCL Device: %s\n", cluGetDeviceDescription(ocl.id, CL_DEVICE)); // create kernel from source ocl.prog = cluBuildProgramFromFile(ocl.ctx, ocl.id, KERNEL_FILE_NAME, NULL); ocl.kernel_step = clCreateKernel(ocl.prog, "simulation_step", &err); CLU_ERRCHECK(err, "Failed to create 'simulation_step' kernel from program"); // create memory buffer ocl.mem_bodies = clCreateBuffer(ocl.ctx, CL_MEM_READ_WRITE, N * sizeof(body), NULL, &err); CLU_ERRCHECK(err, "Failed to create memory buffer"); // fill memory buffer err = clEnqueueWriteBuffer(ocl.queue, ocl.mem_bodies, CL_FALSE, 0, N * sizeof(body), B, 0, NULL, NULL); CLU_ERRCHECK(err, "Failed to write data to device"); // set arguments cluSetKernelArguments(ocl.kernel_step, 1, sizeof(cl_mem), (void *)&ocl.mem_bodies); }
int main() { // unsigned long start_time = time_ms(); // init matrix memset(u, 0, N*N*sizeof(VALUE)); printf("Jacobi with N=%d, L_SZ=%d, IT=%d\n", N, L_SZ, IT); printf("Kernel file name: %s\n", KERNEL_FILE_NAME); // init F for(int i=0; i<N; i++) for(int j=0; j<N; j++) f[i][j] = init_func(i, j); VALUE factor = pow((VALUE)1/N, 2); // ocl initialization cl_context context; cl_command_queue command_queue; cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue); // create memory buffers cl_int err; cl_mem matrix_U = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err); cl_mem matrix_F = clCreateBuffer(context, CL_MEM_READ_ONLY, N * N * sizeof(VALUE), NULL, &err); cl_mem matrix_TMP = clCreateBuffer(context, CL_MEM_READ_WRITE, N * N * sizeof(VALUE), NULL, &err); CLU_ERRCHECK(err, "Failed to create buffer for matrix"); // used for profiling info cl_event ev_write_U; cl_event ev_write_F; cl_event ev_kernel; cl_event ev_read_TMP; double write_total, read_total, kernel_total; write_total = read_total = kernel_total = 0.0; // create kernel from source char tmp[1024]; sprintf(tmp, "-DN=%i -DVALUE=%s", N, EXPAND_AND_QUOTE(VALUE)); cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp); cl_kernel kernel = clCreateKernel(program, "jacobi", &err); CLU_ERRCHECK(err, "Failed to create matrix_mul kernel from program"); /* ---------------------------- main part ----------------------------------- */ // also initialize target matrix with zero values!!! err = clEnqueueWriteBuffer(command_queue, matrix_TMP, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U); CLU_ERRCHECK(err, "Failed to write matrix to device"); // write f to device err = clEnqueueWriteBuffer(command_queue, matrix_F, CL_FALSE, 0, N * N * sizeof(VALUE), f, 0, NULL, &ev_write_F); CLU_ERRCHECK(err, "Failed to write matrix F to device"); // write matrix u to device err = clEnqueueWriteBuffer(command_queue, matrix_U, CL_FALSE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_write_U); CLU_ERRCHECK(err, "Failed to write matrix U to device"); // define global work size size_t g_work_size[2] = {N, N}; size_t l_work_size[2] = {L_SZ, L_SZ}; cl_mem buffer_u; cl_mem buffer_tmp; for (int i = 0; i < IT; ++i) { // swap U and TMP arguments based on iteration counter if(i % 2 == 0) { buffer_u = matrix_U; buffer_tmp = matrix_TMP; } else { buffer_u = matrix_TMP; buffer_tmp = matrix_U; } // compute memory block dimensions int block_dim = (L_SZ + 2) * (L_SZ + 2); // set kernel arguments cluSetKernelArguments(kernel, 5, sizeof(cl_mem), (void *)&buffer_u, sizeof(cl_mem), (void *)&matrix_F, sizeof(cl_mem), (void *)&buffer_tmp, // local memory buffer block_dim * sizeof(VALUE), NULL, sizeof(VALUE), (void *)&factor); // execute kernel err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, g_work_size, l_work_size, 0, NULL, &ev_kernel); CLU_ERRCHECK(err, "Failed to enqueue 2D kernel"); // wait until execution completes clWaitForEvents(1, &ev_kernel); // add profiling information kernel_total += getDurationMS(ev_kernel); } // copy results back to host err = clEnqueueReadBuffer(command_queue, buffer_tmp, CL_TRUE, 0, N * N * sizeof(VALUE), u, 0, NULL, &ev_read_TMP); CLU_ERRCHECK(err, "Failed reading back result"); // compute profiling information write_total += getDurationMS(ev_write_U); write_total += getDurationMS(ev_write_F); read_total += getDurationMS(ev_read_TMP); /* ---------------------------- evaluate results ---------------------------------- */ // print result printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE)); // printf("Verification: %4s\n", (success) ? "OK" : "ERR"); printf("Write total: %9.4f ms\n", write_total); printf("Read total: %9.4f ms\n", read_total); printf("Kernel execution: %9.4f ms\n", kernel_total); printf("Time total: %9.4f ms\n\n", write_total + read_total + kernel_total); #ifdef DEBUG print_result(u); #endif /* ---------------------------- finalization ------------------------------------- */ err = clFinish(command_queue); err |= clReleaseKernel(kernel); err |= clReleaseProgram(program); err |= clReleaseMemObject(matrix_U); err |= clReleaseMemObject(matrix_F); err |= clReleaseMemObject(matrix_TMP); err |= clReleaseCommandQueue(command_queue); err |= clReleaseContext(context); CLU_ERRCHECK(err, "Failed during ocl cleanup"); return EXIT_SUCCESS; }
int main(int argc, char** argv){ srand(time(NULL)); if(argc != 2) { printf("Usage: search [elements]\nExample: scan 10000\n"); return -1; } unsigned long long start_time = time_ms(); int event_amount=2; int elems = atoi(argv[1]); cl_int err; cl_event* events=allocateMemoryForEvent(event_amount); cl_ulong total_downsweep=0,total_hillissteele=0; size_t localWorkGroupSize_downSweep[1]={LOCALSIZE}; //must be power of two size_t globalWorkGroupSize_downSweep[1]={getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2))}; //calculating size_t localWorkGroupSize_hillissteele[1]={LOCALSIZE}; //must be power of two size_t globalWorkGroupSize_hillissteele[1]={roundUp(LOCALSIZE,elems)}; //calculating worksize int howManyWorkGroups=globalWorkGroupSize_downSweep[0]/LOCALSIZE; //quotient is power of two, since dividend and divisor are power of two int sumBuffer_length_downSweep=howManyWorkGroups; int sumBuffer_length_hillis=getPowerOfTwo(roundUp(LOCALSIZE,elems)/LOCALSIZE); VALUE *data = (VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result_seq=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result_hillissteele=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *sum=(VALUE*)malloc(sumBuffer_length_downSweep*sizeof(VALUE)); VALUE *sum_hillis=(VALUE*)malloc(sumBuffer_length_hillis*sizeof(VALUE)); memset(sum_hillis,0,sumBuffer_length_hillis*sizeof(VALUE)); memset(result_seq,0,elems*sizeof(VALUE)); // initialize data set (fill randomly) for(int j=0; j<elems; ++j) { data[j] =rand()%121; } // printResult(data, elems, 4, "INPUT"); /*Sequential Scan*/ for(int i=1; i<elems; i++){ result_seq[i]=result_seq[i-1]+data[i-1]; } // printResult(result_seq, elems, 4, "Sequential Algorithm OUTPUT"); //ocl initialization size_t deviceInfo; cl_context context; cl_command_queue command_queue; cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue); clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t), &deviceInfo,NULL ); // create memory buffer cl_mem mem_data=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err); cl_mem mem_data_hillis=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err); cl_mem mem_result=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err); cl_mem mem_result_tmp=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err); cl_mem mem_sum=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_downSweep*sizeof(VALUE), NULL, &err); cl_mem mem_sum_hillis=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_hillis*sizeof(VALUE), NULL, &err); CLU_ERRCHECK(err, "Failed to create Buffer"); err=clEnqueueWriteBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL); CLU_ERRCHECK(err, "Failed to write values into mem_sum"); // create kernel from source char tmp[1024]; sprintf(tmp,"-DVALUE=%s", EXPAND_AND_QUOTE(VALUE)); cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp); cl_kernel kernel_downSweep = clCreateKernel(program, "prefix_scan_downSweep", &err); cl_kernel kernel_hillissteele=clCreateKernel(program, "prefix_scan_hillissteele", &err); cl_kernel kernel_last_stage= clCreateKernel(program, "prefix_scan_last_stage", &err); CLU_ERRCHECK(err,"Could not load source program"); /*-------------------------------------DOWNSWEEP-----------------------------------------------*/ // set arguments int border=elems/2; int flag=1; cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_data, sizeof(cl_mem), (void*)&mem_result, sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values"); /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values"); clFinish(command_queue); printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM"); */ err=clEnqueueCopyBuffer(command_queue, mem_result, mem_result_tmp, 0, 0, elems*sizeof(VALUE),0,NULL,NULL); CLU_ERRCHECK(err,"DownSweep_Failed during copying buffer"); /*+++++++++++++++++++++++++++++++++DOWNSWEEP-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_downSweep/2; //since sumbuffer_length is power of two no further adaption is needed cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_sum, sizeof(cl_mem), (void*)&mem_sum, sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*sumBuffer_length_downSweep, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); howManyWorkGroups>1 ? globalWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups; //if 1 workgroup make adaption howManyWorkGroups>1 ? localWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups; //if 1 workgroup make adaption //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL,&(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values"); printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM PREFIX"); */ /*+++++++++++++++++++++++++++++++++DOWNSWEEP-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/ border=sumBuffer_length_downSweep; flag=1; cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result_tmp, sizeof(cl_mem), (void*)&mem_sum, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); globalWorkGroupSize_downSweep[0]=getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2)); localWorkGroupSize_downSweep[0]=LOCALSIZE; //printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result_tmp, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values"); /*---------------------------------------HILLISSTEELE----------------------------------------------------------*/ flag=1; border=elems; cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_data_hillis, sizeof(cl_mem), (void*)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel //printf("GlobalSize: %d\tLocalWorkGroupSize: %d\n",globalWorkGroupSize[0], localWorkGroupSize[0]); //printf("Amount of WorkGroups: %d\n", globalWorkGroupSize[0]/localWorkGroupSize[0]); CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Inputbuffer"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Failed to read Result Values"); CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum_1 Values"); printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM"); printResult(result_hillissteele,elems, 4, "HILLISSTEELE Temporary OUTPUT"); */ /*++++++++++++++++++++++++++++++++++++++HILLISSTEELE-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_hillis; cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*howManyWorkGroups*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel globalWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis; localWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis; CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Sumbuffer"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum2 Values"); printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM PREFIX"); */ /*+++++++++++++++++++++++++++++++++++++HILLISSTEELE-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_hillis; cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); globalWorkGroupSize_hillissteele[0]=roundUp(LOCALSIZE,elems); localWorkGroupSize_hillissteele[0]=LOCALSIZE; //printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue kernel_Last_stage"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Hillissteele_Failed to read Result Values"); /*-------------------------FINISHED---------------------------------------------*/ //printResult(result_hillissteele, elems, 4, "HILLISSTEELE OUTPUT"); //printResult(result, elems, 4, "IMPROVED IMPLEMENTATION OUTPUT"); //verify results verifyResult(result_seq,result,elems, "Verifying result of DownSweep for bigger array size"); verifyResult(result_seq,result_hillissteele,elems, "Verifying result of HILLISSTEELE for bigger array size"); printProfileInfo(total_downsweep,"Improved Algorithm Time:"); printProfileInfo(total_hillissteele,"Hillis & Steele Time:"); printf("\nDEVICE INFO MAX_WORK_GROUP_SIZE: %d\n", (int) deviceInfo); printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE)); printf("Done, took %16llu ms\n", time_ms()-start_time); // finalization for(int i=0; i<event_amount; i++){ clReleaseEvent(events[i]); } err = clFinish(command_queue); err |= clReleaseKernel(kernel_downSweep); err |= clReleaseKernel(kernel_last_stage); err |= clReleaseKernel(kernel_hillissteele); err |= clReleaseProgram(program); err |= clReleaseMemObject(mem_data); err |= clReleaseMemObject(mem_data_hillis); err |= clReleaseMemObject(mem_result); err |= clReleaseMemObject(mem_result_tmp); err |= clReleaseMemObject(mem_sum); err |= clReleaseMemObject(mem_sum_hillis); err |= clReleaseCommandQueue(command_queue); err |= clReleaseContext(context); CLU_ERRCHECK(err, "Failed during ocl cleanup"); free(events); free(result); free(result_hillissteele); free(result_seq); free(sum); free(sum_hillis); return EXIT_SUCCESS; }