void ocl_init(int deviceId, int nr_lines, size_t field_nr_points,int linelength, int fielddim_x,int fielddim_y,int fielddim_z, bool regular, bool outofcore) { #ifdef CUDA printf("CUDA version\n"); #else printf("OpenCL version\n"); #endif icl_init_devices(ICL_ALL); size_t deviceNum = icl_get_num_devices(); if (deviceNum != 0) { for(int i=0; i<deviceNum; i++){ icl_device *temp = icl_get_device(i); printf("%d - ", i); icl_print_device_short_info(temp); } device = icl_get_device(deviceId); printf("\nSelected device: "); icl_print_device_short_info(device); char comp_flags[256]; sprintf(comp_flags, "-D LOCAL_GROUP_SIZE=%d -D LOOP_UNROLL=%d", LOCAL_GROUP_SIZE, LOOP_UNROLL); reg_field1_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field1", comp_flags, ICL_SOURCE); reg_field1b_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field1_blocking", comp_flags, ICL_SOURCE); reg_field1u_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field1_unroll", comp_flags, ICL_SOURCE); reg_field1bu_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field1_blocking_unroll", comp_flags, ICL_SOURCE); reg_field2_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field2", comp_flags, ICL_SOURCE); reg_field3_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field3", comp_flags, ICL_SOURCE); reg_field_outofcore_kernel = icl_create_kernel(device, "../turning_band.cl", "make_reg_field_outofcore_blocking", comp_flags, ICL_SOURCE); irr_field_kernel = icl_create_kernel(device, "../turning_band.cl", "make_irr_field", comp_flags, ICL_SOURCE); irr_fieldb_kernel = icl_create_kernel(device, "../turning_band.cl", "make_irr_field_blocking", comp_flags, ICL_SOURCE); irr_fieldu_kernel = icl_create_kernel(device, "../turning_band.cl", "make_irr_field_unroll", comp_flags, ICL_SOURCE); irr_fieldbu_kernel = icl_create_kernel(device, "../turning_band.cl", "make_irr_field_blocking_unroll", comp_flags, ICL_SOURCE); vecs_buf = icl_create_buffer(device, CL_MEM_READ_ONLY, sizeof(cl_double4) * nr_lines); y_buf = icl_create_buffer(device, CL_MEM_READ_ONLY, sizeof(cl_double) * linelength * nr_lines + 1); if(regular) { size_t _size = outofcore ? (OUTOFCORE_SIZE) : (fielddim_x * fielddim_y * fielddim_z); RF_buf = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(cl_double) * _size); } else RF_buf = icl_create_buffer(device, CL_MEM_READ_WRITE, sizeof(cl_double4) * field_nr_points); icl_print_device_infos(device); } else { fprintf(stderr, "Error: OpenCL device not found"); exit(1); // failure exit } }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); chdir(PATH); int size = args->size; icl_print_args(args); cl_float4* output = (cl_float4*)malloc(sizeof(cl_float4) * size); icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "sinewave.cl", "sinewave", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * size); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 2, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(cl_float4) * size, &output[0], NULL, NULL); icl_release_buffer(buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); // for the test check printf("Result check: OK\n"); icl_release_args(args); icl_release_devices(); free(output); }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); chdir(PATH); int size = args->size; int* input = (int*)malloc(sizeof(int) * size); int* output = (int *)malloc(sizeof(int) * size); for(int i=0; i < size; ++i) { input[i] = i; } icl_init_devices(ICL_ALL); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "simple.cl", "simple", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_input = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_write_buffer(buf_input, CL_TRUE, sizeof(int) * size, &input[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 3, (size_t)0, (void *)buf_input, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], NULL, NULL); icl_release_buffers(2, buf_input, buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= Simple program working\n"); unsigned int check = 1; for(unsigned int i = 0; i < size; ++i) { if(output[i] != input[i]) { check = 0; printf("= fail at %d, expected %d / actual %d", i, i, output[i]); break; } } printf("======================\n"); printf("Result check: %s\n", check ? "OK" : "FAIL"); } else { printf("Result check: OK\n"); } icl_release_args(args); icl_release_devices(); free(input); free(output); }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); chdir(PATH); int size = args->size; float* input1 = (float*) malloc(sizeof(float) * size); float* input2 = (float*) malloc(sizeof(float) * size); float* alpha = (float*) malloc(sizeof(float) * size); float* beta = (float*) malloc(sizeof(float) * size); float* output = (float*) malloc(sizeof(float) * size); fill_random_float(input2, size, 1, -1.0f, 1.0f); qsort(input2, size, sizeof(float), float_compare); float step = 2.0f / size; for(int i=0; i < size; i++) input1[i] = -1.0f + i * step; fill_random_float(alpha, size, 1, -1.0f, 1.0f); fill_random_float(beta, size, 1, -1.0f, 1.0f); icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "lin_reg.cl", "lin_reg", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_alpha = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_beta = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size); icl_write_buffer(buf_input1, CL_TRUE, sizeof(float) * size, &input1[0], NULL, NULL); icl_write_buffer(buf_input2, CL_TRUE, sizeof(float) * size, &input2[0], NULL, NULL); icl_write_buffer(buf_alpha, CL_TRUE, sizeof(float) * size, &alpha[0], NULL, NULL); icl_write_buffer(buf_beta, CL_TRUE, sizeof(float) * size, &beta[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6, (size_t)0, (void *)buf_input1, (size_t)0, (void *)buf_input2, (size_t)0, (void *)buf_alpha, (size_t)0, (void *)buf_beta, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(float) * size, &output[0], NULL, NULL); icl_release_buffers(5, buf_input1, buf_input2, buf_alpha, buf_beta, buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= Linear Regression Done\n"); float* output2 = (float *)malloc(sizeof(float) * size); for(unsigned int j = 0; j < size; ++j) { const int gid = j; float a = alpha[gid]; float b = beta[gid]; float error = 0; for(int i=0; i<size; i++) { float e = (a * input1[i] + b) - input2[i]; error += e * e; } output2[gid] = error; } bool check = compare_float(output, output2, size, 0.000001); printf("======================\n"); printf("Result check: %s\n", check ? "OK" : "FAIL"); free(output2); } else { printf("Result check: OK\n"); } icl_release_args(args); icl_release_devices(); free(input1); free(input2); free(alpha); free(beta); free(output); }
int main(int argc, const char* argv[]) { chdir(PATH); icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); int size = args->size; cl_uint* ma = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_uint* b = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_uint* c = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_uint* seed = (cl_uint*) malloc(sizeof(cl_uint) * size); cl_float4* result = (cl_float4*)malloc(sizeof(cl_float4) * size); for (cl_uint i = 0; i < size; ++i) { ma[i] = i; b[i] = i; c[i] = i; seed[i] = i; } icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "mers_twister.cl", "mersenne_twister", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_ma = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_b = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_c = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_seed = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_uint) * size); icl_buffer* buf_result = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * size); icl_write_buffer(buf_ma, CL_FALSE, sizeof(cl_uint) * size, &ma[0], NULL, NULL); icl_write_buffer(buf_b, CL_FALSE, sizeof(cl_uint) * size, &b[0], NULL, NULL); icl_write_buffer(buf_c, CL_FALSE, sizeof(cl_uint) * size, &c[0], NULL, NULL); icl_write_buffer(buf_seed, CL_TRUE, sizeof(cl_uint) * size, &seed[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6, (size_t)0, (void *)buf_ma, (size_t)0, (void *)buf_b, (size_t)0, (void *)buf_c, (size_t)0, (void *)buf_seed, (size_t)0, (void *)buf_result, sizeof(cl_int), (void *)&size ); icl_read_buffer(buf_result, CL_TRUE, sizeof(cl_float4) * size, &result[0], NULL, NULL); icl_release_buffers(5, buf_ma, buf_b, buf_c, buf_seed, buf_result); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= mersenne twister test\n"); printf("Check not Implemented\n"); printf("Result check: OK\n"); } else { printf("Result check: OK\n"); } icl_release_devices(); free(ma); free(b); free(c); free(seed); free(result); return 0; }
int main (int argc, char **argv) { struct BBox box; struct Particle *particles; particle_data *P; io_header header; int tot = snapshotLoader(argv[1], &header, &P); int k = 0; if(tot <= 0) { printf("error while loading snapshot file\n"); return -1; } initBBox2(&box); particles = (struct Particle*)malloc(header.npartTotal[1] * sizeof(struct Particle)); UINT* particleIds = (UINT*)malloc(header.npartTotal[1] * sizeof(UINT)); //for(int j = header.npartTotal[0]; j < header.npartTotal[0]+header.npartTotal[1]; ++j) #define F 1 for(int j = header.npartTotal[0]; j < header.npartTotal[0]+header.npartTotal[1]; j += F) { particles[k].pos.x = P[j].Pos[0]; particles[k].pos.y = P[j].Pos[1]; particles[k].pos.z = P[j].Pos[2]; particles[k].vel.x = P[j].Vel[0]; particles[k].vel.y = P[j].Vel[1]; particles[k].vel.z = P[j].Vel[2]; particles[k].mass = P[j].Mass; particles[k].id = P[j].Id; particleIds[k] = P[j].Id; //printf("%f %f %f\n", particles[k].pos.x, particles[k].pos.y, particles[k].pos.z); //get bbox /* if(particles[k].pos.x < box.box[0].x) box.box[0].x = particles[k].pos.x; if(particles[k].pos.y < box.box[0].y) box.box[0].y = particles[k].pos.y; if(particles[k].pos.z < box.box[0].z) box.box[0].z = particles[k].pos.z; if(particles[k].pos.x >= box.box[1].x) box.box[1].x = particles[k].pos.x; if(particles[k].pos.y >= box.box[1].y) box.box[1].y = particles[k].pos.y; if(particles[k].pos.z >= box.box[1].z) box.box[1].z = particles[k].pos.z; */ ++k; } free(P); header.npartTotal[1] /= F; struct Tree tree; tree.nodelist = (struct Node*)malloc(2*header.npartTotal[1] * sizeof(struct Node)); struct Particle* ref = (struct Particle*)malloc(sizeof(struct Particle) * header.npartTotal[1]); // init ocl icl_init_devices(DEVICE); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_buffer* particlesD = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct Particle) * header.npartTotal[1]); // copy particles to ocl device icl_write_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL); run(1, 0.00001, 0.0025, particles, particlesD, header.npartTotal[1], dev, &tree, ref); // icl_buffer* kdTree = icl_create_buffer(dev, CL_MEM_READ_WRITE, sizeof(struct KdNode) * (header.npartTotal[1] * 2 - 1)); // buildTree(tree.nodelist, particlesD, kdTree, header.npartTotal[1], dev); // TODO particles have been resorted during tree construction. Upload them in original sorting for comparison, not needed for correctness REMOVE IT! // icl_write_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL); // walk(kdTree, particlesD, header.npartTotal[1], 0.00001, dev); // // read particles from device, used as reference for correctness check // icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &ref[0], NULL, NULL); // // printf("Walk second time with last acceleration of particles\n"); // // walk(kdTree, particlesD, header.npartTotal[1], 0.00001, dev, particles); // // // read particles from device icl_read_buffer(particlesD, CL_TRUE, sizeof(struct Particle) * header.npartTotal[1], &particles[0], NULL, NULL); // // icl_release_buffer(kdTree); icl_release_buffer(particlesD); icl_release_devices(); printf("\nSUCCESS\n"); } else { printf("ERROR! Cannot find requested device\n"); return -1; } // check_force("forcetest_1e5.txt", "result.txt", particles, particleIds, header.npartTotal[1]); // check_force("forcetest.txt", "result.txt", particles, particleIds, header.npartTotal[1]); #if timing == 1 check_force_internal("result.txt", ref, particles, particleIds, header.npartTotal[1]); #endif // display interactions, stored in each particle at acc.x /*FLOAT average = 0; for(UINT i = 0; i < header.npartTotal[1]; ++i) { average += particles[i].acc.x; } printf("Average number of interactions: %f\n", average/header.npartTotal[1]); */ free(tree.nodelist); free(particles); free(ref); return 0; }
int main(int argc, const char* argv[]) { icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); chdir(PATH); // int dim = 128; int size = args->size; int nRef = 100000; float* ref = (float*)malloc(sizeof(float) * nRef /* dim*/); float* query = (float*)malloc(sizeof(float) * size /* dim*/); float* dists = (float *)malloc(sizeof(float) * size); int* neighbors = (int*)malloc(sizeof(int) * size); srand(42); for(int i=0; i < nRef/*dim*/; ++i) { ref[i] = rand(); } for(int i=0; i < size/**dim*/; ++i) { query[i] = rand(); } icl_init_devices(ICL_ALL); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "knn.cl", "knn", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_ref = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * nRef /* dim*/); icl_buffer* buf_query = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(float) * size /* dim*/); icl_buffer* buf_dists = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size); icl_buffer* buf_neighbors = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_write_buffer(buf_ref, CL_TRUE, sizeof(float) * nRef /*dim*/, &ref[0], NULL, NULL); icl_write_buffer(buf_query, CL_TRUE, sizeof(float) * size /*dim*/, &query[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 6, (size_t)0, (void *)buf_ref, (size_t)0, (void *)buf_query, (size_t)0, (void *)buf_dists, (size_t)0, (void *)buf_neighbors, sizeof(cl_int), (void *)&nRef, sizeof(cl_int), (void *)&size); // sizeof(cl_int), (void *)&dim); icl_read_buffer(buf_dists, CL_TRUE, sizeof(float) * size, &dists[0], NULL, NULL); icl_read_buffer(buf_neighbors, CL_TRUE, sizeof(int) * size, &neighbors[0], NULL, NULL); icl_release_buffers(4, buf_ref, buf_query, buf_dists, buf_neighbors); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); if (args->check_result) { printf("======================\n= KNN program working\n"); unsigned int check = 1; unsigned int sum = 0; for(int i = 0; i < size; ++i) { if(dists[i] < 0) check = 0; if(neighbors[i] < 0 || neighbors[i] >= nRef) check = 0; } printf("======================\n"); printf("Result check: %s\n", check ? "OK" : "FAIL"); } else { printf("Result check: OK\n"); } icl_release_args(args); icl_release_devices(); free(ref); free(query); free(dists); free(neighbors); }
int main(int argc, const char* argv[]) { chdir(PATH); icl_args* args = icl_init_args(); icl_parse_args(argc, argv, args); icl_print_args(args); int size = args->size; // this is the size of chunking - so far as big as the local size int chunkSize = 16; cl_float16* input = (cl_float16*)malloc(sizeof(cl_float16) * size); float* output = (float*)malloc(sizeof(float) * size); fillrandom_float((float*)input,size, chunkSize, 0.001f ,100000.f); icl_init_devices(args->device_type); icl_start_energy_measurement(); if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(args->device_id); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "reduction_chunking.cl", "reduce", "", ICL_SOURCE); size_t szLocalWorkSize = args->local_size; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; for (int i = 0; i < args->loop_iteration; ++i) { icl_buffer* buf_input = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(cl_float16) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(float) * size); icl_write_buffer(buf_input, CL_FALSE, sizeof(cl_float16) * size, &input[0], NULL, NULL); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, NULL, NULL, 4, (size_t)0, (void *)buf_input, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&chunkSize, sizeof(cl_int), (void *)&size ); icl_read_buffer(buf_output, CL_TRUE, sizeof(float) * size, &output[0], NULL, NULL); icl_release_buffers(2, buf_input, buf_output); } icl_release_kernel(kernel); } icl_stop_energy_measurement(); // printf("Chunks' minimum \n"); // out_float_hbuffer(output, size); if (args->check_result) { printf("======================\n= Reduction test\n"); unsigned int check = 1; float host_min = 100000.f; float* testInput = (float*)input; for(unsigned int i = 0; i < size*chunkSize; ++i) if(testInput[i] < host_min) host_min = testInput[i]; printf("Host minimum is %f\n", host_min); float device_min = 100000.f; for(unsigned int i = 0; i < size; ++i) if(output[i] < device_min) device_min = output[i]; printf("Device minimum is %f\n", device_min); printf("Result check: %s\n", (device_min == host_min) ? "OK" : "FAIL"); } else { printf("Result check: OK\n"); } icl_release_devices(); free(input); free(output); #ifdef _MSC_VER icl_prompt(); #endif return 0; }
int main(int argc, char* argv[]) { int size = 1000; int* input1 = (int*)malloc(sizeof(int) * size); int* input2 = (int*) malloc(sizeof(int) * size); int* output = (int *)malloc(sizeof(int) * size); for(int i=0; i < size; ++i) { input1[i] = i; input2[i] = 1; } #ifndef INSIEME icl_timer* time1 = icl_init_timer(ICL_SEC); icl_start_timer(time1); #endif icl_init_devices(ICL_CPU); #ifndef INSIEME printf("TIME for initialization: %f\n", icl_stop_timer(time1)); #endif if (icl_get_num_devices() != 0) { icl_device* dev = icl_get_device(0); icl_print_device_short_info(dev); icl_kernel* kernel = icl_create_kernel(dev, "vec_mul.cl", "vec_mul", "", ICL_SOURCE); icl_buffer* buf_input1 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_input2 = icl_create_buffer(dev, CL_MEM_READ_ONLY, sizeof(int) * size); icl_buffer* buf_output = icl_create_buffer(dev, CL_MEM_WRITE_ONLY, sizeof(int) * size); icl_event* wb1 = icl_create_event(); icl_event* wb2 = icl_create_event(); icl_event* rb = icl_create_event(); icl_write_buffer(buf_input1, CL_FALSE, sizeof(int) * size, &input1[0], NULL, wb1); icl_write_buffer(buf_input2, CL_FALSE, sizeof(int) * size, &input2[0], NULL, wb2); size_t szLocalWorkSize = 256; float multiplier = size/(float)szLocalWorkSize; if(multiplier > (int)multiplier) multiplier += 1; size_t szGlobalWorkSize = (int)multiplier * szLocalWorkSize; icl_event* rk = icl_create_event(); icl_event* wb_all = icl_create_event_list(2, wb1, wb2); icl_run_kernel(kernel, 1, &szGlobalWorkSize, &szLocalWorkSize, wb_all, rk, 4, (size_t)0, (void *)buf_input1, (size_t)0, (void *)buf_input2, (size_t)0, (void *)buf_output, sizeof(cl_int), (void *)&size); icl_read_buffer(buf_output, CL_TRUE, sizeof(int) * size, &output[0], rk, rb); printf("Time wb1 %f\n", icl_profile_event(wb1, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time wb2 %f\n", icl_profile_event(wb2, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time rk %f\n", icl_profile_event(rk, ICL_STARTED, ICL_FINISHED, ICL_SEC)); printf("Time rb %f\n", icl_profile_event(rb, ICL_STARTED, ICL_FINISHED, ICL_SEC)); icl_release_events(5, wb1, wb2, wb_all, rk, rb); icl_release_buffers(3, buf_input1, buf_input2, buf_output); icl_release_kernel(kernel); } #ifndef INSIEME icl_restart_timer(time1); #endif icl_release_devices(); #ifndef INSIEME printf("TIME for releasing the devices: %f\n", icl_stop_timer(time1)); icl_release_timer(time1); #endif // CHECK for output printf("======================\n= Vector Mul Done\n"); unsigned int check = 1; for(unsigned int i = 0; i < size; ++i) { if(output[i] != i*size) { check = 0; printf("= fail at %d, expected %d / actual %d", i, i*3/2, output[i]); break; } } printf("= result check: %s\n======================\n", check ? "OK" : "FAIL"); free(input1); free(input2); free(output); }