/*
 * Calibrate the device array size used by the dummy-compute kernel so that
 * one kernel launch takes at least target_time seconds, and store the
 * calibrated size in options.device_array_size.
 *
 * target_time: desired minimum duration (seconds) of a single dummy-compute
 *              kernel launch.
 *
 * Only active when compiled with _ENABLE_CUDA_KERNEL_ and the benchmark
 * target includes the GPU.  Relies on file-scope state: options, stream,
 * and the device buffers managed by allocate_device_arrays()/
 * free_device_arrays().
 */
void init_arrays(double target_time)
{
    if (DEBUG) {
        fprintf(stderr, "called init_arrays with target_time = %f \n",
                (target_time * 1e6));
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Starting size of the arrays for the dummy compute. */
        int N = options.device_array_size;

        /* Device arrays for the dummy compute. */
        allocate_device_arrays(N);

        double t1 = 0.0, t2 = 0.0;

        /* Grow N until one kernel launch takes at least target_time. */
        while (1) {
            t1 = MPI_Wtime();
            if (options.target == gpu || options.target == both) {
                cudaStreamCreate(&stream);
                call_kernel(A, d_x, d_y, N, &stream);
                cudaDeviceSynchronize();
                cudaStreamDestroy(stream);
            }
            t2 = MPI_Wtime();

            if ((t2 - t1) < target_time) {
                N += 32;
                /* Free the old device arrays before reallocating at the
                 * larger size; otherwise every iteration of this loop
                 * leaks device memory. */
                free_device_arrays();
                /* Now allocate arrays of size N */
                allocate_device_arrays(N);
            } else {
                break;
            }
        }

        /* We reach here with the desired N, so save it and pass it on
         * through options for the rest of the benchmark. */
        options.device_array_size = N;
        if (DEBUG) {
            fprintf(stderr, "correct N = %d\n", N);
        }
    }
#endif
}
/* Print code for initializing the device for execution of the transformed
 * code.  This includes declaring locally defined variables as well as
 * declaring and allocating the required copies of arrays on the device.
 */
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	/* Each printing step consumes the printer and hands it back, so the
	 * stages compose directly.  The emission order (macros, local
	 * declarations, array declarations, allocations, texture/surface
	 * bindings) must be preserved.
	 */
	return bind_device_textures_surfaces(
		allocate_device_arrays(
			declare_device_arrays(
				gpu_print_local_declarations(
					print_cuda_macros(p), prog),
				prog),
			prog),
		prog);
}
/* Allocate size bytes on the host or abort with a diagnostic; malloc
 * failures previously went unchecked and would crash on first use. */
static void *init_arrays_malloc(size_t size)
{
    void *buf = malloc(size);

    if (NULL == buf) {
        fprintf(stderr, "malloc of %zu bytes failed in init_arrays\n", size);
        exit(EXIT_FAILURE);
    }
    return buf;
}

/*
 * Initialize the host-side arrays used by the CPU dummy compute
 * (a: DIM x DIM matrix of 2.0f, x and y: DIM-length vectors of 1.0f),
 * then, when the benchmark targets the GPU, calibrate the device array
 * size so that one dummy-compute kernel launch takes at least
 * target_time seconds.  The calibrated size is saved in
 * options.device_array_size.
 *
 * target_time: desired minimum duration (seconds) of a single
 *              dummy-compute kernel launch.
 *
 * Relies on file-scope state: a, x, y, DIM, options, stream, and the
 * device buffers managed by allocate_device_arrays()/free_device_arrays().
 */
void init_arrays(double target_time)
{
    int i = 0, j = 0;

    if (DEBUG) {
        fprintf(stderr, "called init_arrays with target_time = %f \n",
                (target_time * 1e6));
    }

    /* Host arrays for the CPU dummy compute. */
    a = (float **)init_arrays_malloc(DIM * sizeof(float *));
    for (i = 0; i < DIM; i++) {
        a[i] = (float *)init_arrays_malloc(DIM * sizeof(float));
    }
    x = (float *)init_arrays_malloc(DIM * sizeof(float));
    y = (float *)init_arrays_malloc(DIM * sizeof(float));

    for (i = 0; i < DIM; i++) {
        x[i] = y[i] = 1.0f;
        for (j = 0; j < DIM; j++) {
            a[i][j] = 2.0f;
        }
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Starting size of the arrays for the dummy compute. */
        int N = options.device_array_size;

        /* Device arrays for the dummy compute. */
        allocate_device_arrays(N);

        double t1 = 0.0, t2 = 0.0;

        /* Grow N until one kernel launch takes at least target_time.
         * The target check is already established by the enclosing if,
         * so the kernel is launched unconditionally here. */
        while (1) {
            t1 = MPI_Wtime();
            cudaStreamCreate(&stream);
            call_kernel(A, d_x, d_y, N, &stream);
            cudaDeviceSynchronize();
            cudaStreamDestroy(stream);
            t2 = MPI_Wtime();

            if ((t2 - t1) < target_time) {
                N += 32;
                /* First free the old arrays */
                free_device_arrays();
                /* Now allocate arrays of size N */
                allocate_device_arrays(N);
            } else {
                break;
            }
        }

        /* We reach here with the desired N, so save it and pass it on
         * through options for the rest of the benchmark. */
        options.device_array_size = N;
        if (DEBUG) {
            fprintf(stderr, "correct N = %d\n", N);
        }
    }
#endif
}