static void signal_handler(int sig) { char *name; switch (sig) { case SIGILL: name = "SIGILL"; break; case SIGABRT: name = "SIGABRT"; break; case SIGBUS: name = "SIGBUS"; break; case SIGSEGV: name = "SIGSEGV"; break; default: name = "??"; } fprintf(stdout, "received signal %s\n", name); printStackTrace(); printProfileInfo(); signal(SIGILL, NULL); signal(SIGABRT, NULL); signal(SIGBUS, NULL); signal(SIGSEGV, NULL); abort(); }
int main(int argc, char** argv){ srand(time(NULL)); if(argc != 2) { printf("Usage: search [elements]\nExample: scan 10000\n"); return -1; } unsigned long long start_time = time_ms(); int event_amount=2; int elems = atoi(argv[1]); cl_int err; cl_event* events=allocateMemoryForEvent(event_amount); cl_ulong total_downsweep=0,total_hillissteele=0; size_t localWorkGroupSize_downSweep[1]={LOCALSIZE}; //must be power of two size_t globalWorkGroupSize_downSweep[1]={getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2))}; //calculating size_t localWorkGroupSize_hillissteele[1]={LOCALSIZE}; //must be power of two size_t globalWorkGroupSize_hillissteele[1]={roundUp(LOCALSIZE,elems)}; //calculating worksize int howManyWorkGroups=globalWorkGroupSize_downSweep[0]/LOCALSIZE; //quotient is power of two, since dividend and divisor are power of two int sumBuffer_length_downSweep=howManyWorkGroups; int sumBuffer_length_hillis=getPowerOfTwo(roundUp(LOCALSIZE,elems)/LOCALSIZE); VALUE *data = (VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result_seq=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result_hillissteele=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *sum=(VALUE*)malloc(sumBuffer_length_downSweep*sizeof(VALUE)); VALUE *sum_hillis=(VALUE*)malloc(sumBuffer_length_hillis*sizeof(VALUE)); memset(sum_hillis,0,sumBuffer_length_hillis*sizeof(VALUE)); memset(result_seq,0,elems*sizeof(VALUE)); // initialize data set (fill randomly) for(int j=0; j<elems; ++j) { data[j] =rand()%121; } // printResult(data, elems, 4, "INPUT"); /*Sequential Scan*/ for(int i=1; i<elems; i++){ result_seq[i]=result_seq[i-1]+data[i-1]; } // printResult(result_seq, elems, 4, "Sequential Algorithm OUTPUT"); //ocl initialization size_t deviceInfo; cl_context context; cl_command_queue command_queue; cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue); clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t), &deviceInfo,NULL ); // create memory buffer cl_mem mem_data=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err); cl_mem mem_data_hillis=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err); cl_mem mem_result=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err); cl_mem mem_result_tmp=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err); cl_mem mem_sum=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_downSweep*sizeof(VALUE), NULL, &err); cl_mem mem_sum_hillis=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_hillis*sizeof(VALUE), NULL, &err); CLU_ERRCHECK(err, "Failed to create Buffer"); err=clEnqueueWriteBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL); CLU_ERRCHECK(err, "Failed to write values into mem_sum"); // create kernel from source char tmp[1024]; sprintf(tmp,"-DVALUE=%s", EXPAND_AND_QUOTE(VALUE)); cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp); cl_kernel kernel_downSweep = clCreateKernel(program, "prefix_scan_downSweep", &err); cl_kernel kernel_hillissteele=clCreateKernel(program, "prefix_scan_hillissteele", &err); cl_kernel kernel_last_stage= clCreateKernel(program, "prefix_scan_last_stage", &err); CLU_ERRCHECK(err,"Could not load source program"); /*-------------------------------------DOWNSWEEP-----------------------------------------------*/ // set arguments int border=elems/2; int flag=1; cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_data, sizeof(cl_mem), (void*)&mem_result, sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values"); /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values"); clFinish(command_queue); printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM"); */ err=clEnqueueCopyBuffer(command_queue, mem_result, mem_result_tmp, 0, 0, elems*sizeof(VALUE),0,NULL,NULL); CLU_ERRCHECK(err,"DownSweep_Failed during copying buffer"); /*+++++++++++++++++++++++++++++++++DOWNSWEEP-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_downSweep/2; //since sumbuffer_length is power of two no further adaption is needed cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_sum, sizeof(cl_mem), (void*)&mem_sum, sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*sumBuffer_length_downSweep, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); howManyWorkGroups>1 ? globalWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups; //if 1 workgroup make adaption howManyWorkGroups>1 ? localWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups; //if 1 workgroup make adaption //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL,&(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values"); printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM PREFIX"); */ /*+++++++++++++++++++++++++++++++++DOWNSWEEP-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/ border=sumBuffer_length_downSweep; flag=1; cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result_tmp, sizeof(cl_mem), (void*)&mem_sum, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); globalWorkGroupSize_downSweep[0]=getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2)); localWorkGroupSize_downSweep[0]=LOCALSIZE; //printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result_tmp, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values"); /*---------------------------------------HILLISSTEELE----------------------------------------------------------*/ flag=1; border=elems; cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_data_hillis, sizeof(cl_mem), (void*)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel //printf("GlobalSize: %d\tLocalWorkGroupSize: %d\n",globalWorkGroupSize[0], localWorkGroupSize[0]); //printf("Amount of WorkGroups: %d\n", globalWorkGroupSize[0]/localWorkGroupSize[0]); CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Inputbuffer"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Failed to read Result Values"); CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum_1 Values"); printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM"); printResult(result_hillissteele,elems, 4, "HILLISSTEELE Temporary OUTPUT"); */ /*++++++++++++++++++++++++++++++++++++++HILLISSTEELE-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_hillis; cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*howManyWorkGroups*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel globalWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis; localWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis; CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Sumbuffer"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum2 Values"); printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM PREFIX"); */ /*+++++++++++++++++++++++++++++++++++++HILLISSTEELE-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_hillis; cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); globalWorkGroupSize_hillissteele[0]=roundUp(LOCALSIZE,elems); localWorkGroupSize_hillissteele[0]=LOCALSIZE; //printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue kernel_Last_stage"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Hillissteele_Failed to read Result Values"); /*-------------------------FINISHED---------------------------------------------*/ //printResult(result_hillissteele, elems, 4, "HILLISSTEELE OUTPUT"); //printResult(result, elems, 4, "IMPROVED IMPLEMENTATION OUTPUT"); //verify results verifyResult(result_seq,result,elems, "Verifying result of DownSweep for bigger array size"); verifyResult(result_seq,result_hillissteele,elems, "Verifying result of HILLISSTEELE for bigger array size"); printProfileInfo(total_downsweep,"Improved Algorithm Time:"); printProfileInfo(total_hillissteele,"Hillis & Steele Time:"); printf("\nDEVICE INFO MAX_WORK_GROUP_SIZE: %d\n", (int) deviceInfo); printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE)); printf("Done, took %16llu ms\n", time_ms()-start_time); // finalization for(int i=0; i<event_amount; i++){ clReleaseEvent(events[i]); } err = clFinish(command_queue); err |= clReleaseKernel(kernel_downSweep); err |= clReleaseKernel(kernel_last_stage); err |= clReleaseKernel(kernel_hillissteele); err |= clReleaseProgram(program); err |= clReleaseMemObject(mem_data); err |= clReleaseMemObject(mem_data_hillis); err |= clReleaseMemObject(mem_result); err |= clReleaseMemObject(mem_result_tmp); err |= clReleaseMemObject(mem_sum); err |= clReleaseMemObject(mem_sum_hillis); err |= clReleaseCommandQueue(command_queue); err |= clReleaseContext(context); CLU_ERRCHECK(err, "Failed during ocl cleanup"); free(events); free(result); free(result_hillissteele); free(result_seq); free(sum); free(sum_hillis); return EXIT_SUCCESS; }