/*
 * Sets up the state used by the segmented scan. All objects written here
 * (dev, wx, the three buffers, kernels and events, and the timing counters)
 * are declared outside this function.
 *
 * _wx           stored in wx and used as the divisor when counting work
 *               groups (presumably the work-group size; must be non-zero)
 * maxN          largest element count the scan will be asked to handle;
 *               only used to size the scratch buffers
 * _dev          device on which buffers and kernels are created
 * build_options forwarded unchanged to icl_create_kernel
 * flag          forwarded unchanged to icl_create_kernel
 */
void segmented_scan_init(size_t _wx, UINT maxN, icl_device *_dev, const char* build_options, icl_create_kernel_flag flag) {
	const char *kernelFile = "kernel/boltScan.cl";

	dev = _dev;
	wx = _wx;

	// Size the scratch buffers for the worst case: one entry per work group
	// needed for maxN elements, rounded up to a whole multiple of wx.
	UINT groupCount = (maxN + wx - 1) / wx;
	UINT paddedGroupCount = ((groupCount + wx - 1) / wx) * wx;
	const UINT scratchBytes = sizeof(UINT) * paddedGroupCount;

	preSumArray = icl_create_buffer(dev, CL_MEM_READ_WRITE, scratchBytes);
	postSumArray = icl_create_buffer(dev, CL_MEM_READ_WRITE, scratchBytes);
	keySumArray = icl_create_buffer(dev, CL_MEM_READ_WRITE, scratchBytes);

	// All three kernels come from the same source file.
	perBlockScanByKey = icl_create_kernel(dev, kernelFile, "perBlockScanByKey", build_options, flag);
	intraBlockInclusiveScanByKey = icl_create_kernel(dev, kernelFile, "intraBlockInclusiveScanByKey", build_options, flag);
	perBlockAdditionByKey = icl_create_kernel(dev, kernelFile, "perBlockAdditionByKey", build_options, flag);

	// One event per kernel.
	perBlockScanEvent = icl_create_event();
	intraBlockEvent = icl_create_event();
	perBlockAdditionEvent = icl_create_event();

#if TIMING
	// Reset the per-kernel accumulators and create a millisecond timer.
	perBlockScanTime = 0;
	intraBlockTime = 0;
	perBlockAdditionTime = 0;
	timer = icl_init_timer(ICL_MILLI);
#endif
}
int main(int argc, char* argv[]) { int size = 1000; int* input1 = (int*)malloc(sizeof(int) * size); int* input2 = (int*) malloc(sizeof(int) * size); int* output = (int *)malloc(sizeof(int) * size); for(int i=0; i < size; ++i) { input1[i] = i; input2[i] = 1; } #ifndef INSIEME icl_timer* time1 = icl_init_timer(ICL_SEC); icl_start_timer(time1); #endif icl_init_devices(ICL_CPU); #ifndef INSIEME printf("TIME for initialization: %f\n", icl_stop_timer(time1)); #endif if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE); icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_event* wb1 = icl_create_event(); icl_event* wb2 = icl_create_event(); icl_event* rb = icl_create_event(); icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1); icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2); size_t szLocalWorkSize = 256; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; icl_event* rk = icl_create_event(); icl_event* wb_all = icl_create_event_list(2, wb1, wb2); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4, (size_t)0, (void *)buf_input1, (size_t)0, (void *)buf_input2, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb); printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, 
ICL_SEC)); printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC)); icl_release_events(5, wb1, wb2, wb_all, rk, rb); icl_release_buffers(3, buf_input1, buf_input2, buf_output); icl_release_kernel(kernel); } #ifndef INSIEME icl_restart_timer(time1); #endif icl_release_devices(); #ifndef INSIEME printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1)); icl_release_timer(time1); #endif // CHECK for output printf("======================\n= Vector Mul Done\n"); unsigned int check = 1; for(unsigned int i = 0; i < size; ++i) { if(output[i] != i*size) { check = 0; printf("= fail at %d, expected %d / actual %d", i, i*3/2, output[i]); break; } } printf("= result check: %s\n======================\n", check ? "OK" : "FAIL"); free(input1); free(input2); free(output); }