/*
 * Launches the three scan-by-key kernels, in order, on `data` and `flag`:
 *   1. perBlockScanByKey            - one launch over all work-groups
 *   2. intraBlockInclusiveScanByKey - a single work-group of wx items
 *   3. perBlockAdditionByKey        - one launch over all work-groups
 *
 * data: buffer handed to the first and last kernel (presumably the values,
 *       scanned in place - confirm against the kernel source).
 * flag: buffer handed to the first and last kernel (presumably the keys).
 * n:    actual number of elements to process; use the real size here, not
 *       the over-approximated size used when the buffers were allocated.
 *
 * Relies on file-level state: the work-group size `wx`, the three kernels
 * and their events, and the keySumArray / preSumArray / postSumArray buffers.
 * No wait events are passed to any launch (the wait-event argument is NULL).
 */
void scan(icl_buffer *data, icl_buffer *flag, UINT n) {
	/* Number of work-groups needed to cover n elements (ceiling division). */
	UINT groupCount = ((n + wx - 1) / wx);
	/* Work-group count rounded up to the next multiple of wx. */
	UINT paddedGroupCount = ((groupCount + wx - 1) / wx) * wx;
	/* Block sums each work-item of the single-group middle kernel handles. */
	UINT itemsPerThread = paddedGroupCount / wx;
	/* Global size for the per-block kernels: whole work-groups only. */
	size_t globalSize = groupCount * wx;
	/* Initial value passed to the first kernel. */
	INT initialValue = -1;

#if TIMING
	/* Drain the queue so the timer only covers the launches below. */
	clFinish(dev->queue);
	icl_start_timer(timer);
#endif

	/* Kernel 0: per-block scan; also fills keySumArray and preSumArray. */
	icl_run_kernel(perBlockScanByKey, 1, &globalSize, &wx, NULL, perBlockScanEvent, 8,
		(size_t)0, (void *)flag,
		(size_t)0, (void *)data,
		sizeof(INT), &initialValue,
		sizeof(UINT), &n,
		sizeof(UINT) * wx, NULL,   /* size-only argument: presumably key scratch space */
		sizeof(INT) * wx, NULL,    /* size-only argument: presumably value scratch space */
		(size_t)0, (void *)keySumArray,
		(size_t)0, (void *)preSumArray);

	/* Kernel 1: global size == local size == wx, i.e. one work-group. */
	icl_run_kernel(intraBlockInclusiveScanByKey, 1, &wx, &wx, NULL, intraBlockEvent, 7,
		(size_t)0, (void *)keySumArray,
		(size_t)0, (void *)preSumArray,
		(size_t)0, (void *)postSumArray,
		sizeof(UINT), &groupCount,
		sizeof(UINT) * wx, NULL,   /* size-only argument: presumably key scratch space */
		sizeof(INT) * wx, NULL,    /* size-only argument: presumably value scratch space */
		sizeof(UINT), &itemsPerThread);

	/* Kernel 2: per-block addition, given postSumArray plus flag and data. */
	icl_run_kernel(perBlockAdditionByKey, 1, &globalSize, &wx, NULL, perBlockAdditionEvent, 5,
		(size_t)0, (void *)keySumArray,
		(size_t)0, (void *)postSumArray,
		(size_t)0, (void *)flag,
		(size_t)0, (void *)data,
		sizeof(UINT), &n);

#if TIMING
	/* Wait for all three kernels, then accumulate the per-kernel timings. */
	clFinish(dev->queue);
	icl_stop_timer(timer);

	perBlockScanTime += icl_profile_event(perBlockScanEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
	intraBlockTime += icl_profile_event(intraBlockEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
	perBlockAdditionTime += icl_profile_event(perBlockAdditionEvent, MEASURE_START, ICL_FINISHED, ICL_MILLI);
#endif
}
int main(int argc, char* argv[]) { int size = 1000; int* input1 = (int*)malloc(sizeof(int) * size); int* input2 = (int*) malloc(sizeof(int) * size); int* output = (int *)malloc(sizeof(int) * size); for(int i=0; i < size; ++i) { input1[i] = i; input2[i] = 1; } #ifndef INSIEME icl_timer* time1 = icl_init_timer(ICL_SEC); icl_start_timer(time1); #endif icl_init_devices(ICL_CPU); #ifndef INSIEME printf("TIME for initialization: %f\n", icl_stop_timer(time1)); #endif if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE); icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_event* wb1 = icl_create_event(); icl_event* wb2 = icl_create_event(); icl_event* rb = icl_create_event(); icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1); icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2); size_t szLocalWorkSize = 256; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; icl_event* rk = icl_create_event(); icl_event* wb_all = icl_create_event_list(2, wb1, wb2); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4, (size_t)0, (void *)buf_input1, (size_t)0, (void *)buf_input2, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb); printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, 
ICL_SEC)); printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC)); icl_release_events(5, wb1, wb2, wb_all, rk, rb); icl_release_buffers(3, buf_input1, buf_input2, buf_output); icl_release_kernel(kernel); } #ifndef INSIEME icl_restart_timer(time1); #endif icl_release_devices(); #ifndef INSIEME printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1)); icl_release_timer(time1); #endif // CHECK for output printf("======================\n= Vector Mul Done\n"); unsigned int check = 1; for(unsigned int i = 0; i < size; ++i) { if(output[i] != i*size) { check = 0; printf("= fail at %d, expected %d / actual %d", i, i*3/2, output[i]); break; } } printf("= result check: %s\n======================\n", check ? "OK" : "FAIL"); free(input1); free(input2); free(output); }