int main(void) { DATA_TYPE *a; DATA_TYPE *x1; DATA_TYPE *x2; DATA_TYPE *x1_outputFromGpu; DATA_TYPE *x2_outputFromGpu; DATA_TYPE *y_1; DATA_TYPE *y_2; ///////////////////////// size_t oldSizes[1] = { N }; size_t newSizes[1]; getNewSizes(oldSizes, NULL, newSizes, NULL, "mvt_kernel1", 1); N = newSizes[0]; ///////////////////////// a = (DATA_TYPE *)malloc(N * N * sizeof(DATA_TYPE)); x1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); x2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); x1_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); x2_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); y_1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); y_2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); init_arrays(a, x1, x2, y_1, y_2); platform = new Platform(PLATFORM_ID); context = platform->getContext(); Device device = platform->getDevice(DEVICE_ID); Queue queue(*context,device,Queue::EnableProfiling); cl_mem_init(a, x1, x2, y_1, y_2,queue); Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME); if(!program.build(device)){ std::cout << "Error building the program: \n"; std::cout <<program.getBuildLog(device); } kernel1=program.createKernel(kernel1Name.c_str()); kernel2=program.createKernel(kernel2Name.c_str()); cl_launch_kernel(queue); queue.readBuffer(*x1_mem_obj,N * sizeof(DATA_TYPE), x1_outputFromGpu); queue.readBuffer(*x2_mem_obj,N * sizeof(DATA_TYPE), x2_outputFromGpu); queue.finish(); runMvt(a, x1, x2, y_1, y_2, x1_outputFromGpu,x2_outputFromGpu); cl_clean_up(); free(a); free(x1); free(x2); free(x1_outputFromGpu); free(x2_outputFromGpu); free(y_1); free(y_2); return 0; }
/*
 * HMPP driver for the MVT benchmark.
 *
 * Initialises the input arrays, runs runMvt() once through the HMPP "mvt"
 * codelet (results land in x1_outputFromGpu / x2_outputFromGpu) and once as a
 * plain call (results land in x1 / x2), prints both wall-clock times to
 * stderr, and hands the two result pairs to compareResults().
 *
 * The #pragma hmpp directives are order-sensitive and must stay attached to
 * the statements they precede.
 */
int main() {
  double t_start, t_end;

  /* All arrays are automatic (stack) storage sized by N. */
  DATA_TYPE a[N][N];
  DATA_TYPE x1[N];
  DATA_TYPE x1_outputFromGpu[N];
  DATA_TYPE x2[N];
  DATA_TYPE x2_outputFromGpu[N];
  DATA_TYPE y1[N];
  DATA_TYPE y2[N];

  //initialize the arrays for running on the CPU and GPU
  init_array(a, x1, x1_outputFromGpu, x2, x2_outputFromGpu, y1, y2);

  /* NOTE(review): presumably these reserve the device-side data for the "mvt"
     group and upload the listed arguments ahead of the call - confirm against
     the HMPP directive reference. */
  #pragma hmpp mvt allocate
  #pragma hmpp mvt advancedload, args[a,x1,x2,y1,y2]

  t_start = rtclock();

  //run the algorithm on the GPU
  /* The callsite is marked asynchronous, so the timed region extends to the
     synchronize directive below. */
  #pragma hmpp mvt callsite, args[x1,x2].advancedload=true, asynchronous
  runMvt(a, x1_outputFromGpu, x2_outputFromGpu, y1, y2); // parameters are initialized in decls.h and are initialized with init_array()
  #pragma hmpp mvt synchronize

  t_end = rtclock();
  fprintf(stderr, "GPU Runtime: %0.6lf\n", t_end - t_start);

  /* NOTE(review): presumably copies the x1/x2 results back to the host and
     then frees the device resources - confirm against the HMPP directive
     reference. Both happen outside the timed region. */
  #pragma hmpp mvt delegatedstore, args[x1,x2]
  #pragma hmpp mvt release

  t_start = rtclock();

  //run the algorithm on the CPU
  runMvt(a, x1, x2, y1, y2);

  t_end = rtclock();
  fprintf(stderr, "CPU Runtime: %0.6lf\n", t_end - t_start);

  /* Check the plain-call results against the codelet results. */
  compareResults(x1, x1_outputFromGpu, x2, x2_outputFromGpu);

  return 0;
}
int main(void) { #ifdef ALOCACAO_NORMAL printf(">>>>>>>>>Versao Offload<<<<<<<<<<<\n"); #else printf(">>>>>>>>>Versao Memoria Compartilhada<<<<<<<<<<<\n"); #endif double t_start, t_end; double t_start_init, t_end_init; double t_start_init_off, t_end_init_off; double t_offload_start, t_offload_end; double total_kernel; int i; #ifdef ALOCACAO_NORMAL a = (DATA_TYPE*)malloc(N*N*sizeof(DATA_TYPE)); x1_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); x2_outputFromGpu = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); y_1 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); y_2 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); #endif x1 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); x2 = (DATA_TYPE*)malloc(N*sizeof(DATA_TYPE)); t_start_init = rtclock(); read_cl_file(); t_end_init = rtclock(); tmp_read_cl_file = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); #ifndef MALI cl_initialization(); #else cl_initialization_Mali(); #endif t_end_init = rtclock(); tmp_cl_initialization = t_end_init - t_start_init; total_kernel = t_end_init - t_start_init; t_start_init = rtclock(); cl_mem_init(); t_end_init = rtclock(); tmp_cl_mem_init= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; //Está dentro da função a contagem init(); //------------GPU--------------- //Inicia tempo GPU #ifdef ALOCACAO_NORMAL t_start_init_off = rtclock(); errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N * N, a, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, x1_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, x1, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, x2_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, x2, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, y1_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, y_1, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue, y2_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * N, y_2, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in writing 
buffers\n"); t_end_init_off = rtclock(); tmp_clEnqueueWriteBuffer += t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #endif t_start_init = rtclock(); cl_load_prog(); t_end_init = rtclock(); tmp_cl_load_prog= t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; t_start_init = rtclock(); cl_launch_kernel(); t_end_init = rtclock(); tmp_cl_launch_kernel += t_end_init - t_start_init; total_kernel += t_end_init - t_start_init; #ifdef ALOCACAO_NORMAL t_start_init_off = rtclock(); errcode = clEnqueueReadBuffer(clCommandQue, x1_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), x1_outputFromGpu, 0, NULL, NULL); errcode = clEnqueueReadBuffer(clCommandQue, x2_mem_obj, CL_TRUE, 0, N*sizeof(DATA_TYPE), x2_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_end_init_off = rtclock(); tmp_clEnqueueReadBuffer += t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; #endif //--------------CPU------------------ t_start = rtclock(); runMvt(); t_end = rtclock(); tmp_serial = t_end - t_start; compareResults(x1, x1_outputFromGpu, x2, x2_outputFromGpu); t_start_init_off = rtclock(); cl_clean_up(); t_end_init_off = rtclock(); tmp_cl_clean_up+=t_end_init_off - t_start_init_off; total_kernel += t_end_init_off - t_start_init_off; free(x1); free(x2); #ifdef ALOCACAO_NORMAL free(a); free(x1_outputFromGpu); free(x2_outputFromGpu); free(y_1); free(y_2); #endif printf("\n-------RESULTS-------\n"); printf("Sizes N=%d\n\n", N); printf("read_cl_file -------------> %lf\n", tmp_read_cl_file); printf("cl_initialization --------> %lf\n", tmp_cl_initialization); printf("cl_mem_init --------------> %lf\n", tmp_cl_mem_init); printf("init ---------------------> %lf\n", tmp_init); printf("cl_load_prog -------------> %lf\n", tmp_cl_load_prog); printf("cl_launch_kernel ---------> %lf\n", tmp_cl_launch_kernel); printf("serialExecution ----------> %lf\n", tmp_serial); printf("cl_clean_up 
--------------> %lf\n", tmp_cl_clean_up); printf("clEnqueueWriteBuffer -----> %lf\n", tmp_clEnqueueWriteBuffer); printf("clEnqueueReadBuffer-------> %lf\n", tmp_clEnqueueReadBuffer); printf("clEnqueueMapBuffer -------> %lf\n", tmp_clEnqueueMapBuffer); printf("clEnqueueUnmapMemObject --> %lf\n", tmp_clEnqueueUnmapMemObject); return 0; }