void free_buffer (void * buffer, enum accel_type type)
{
    switch (type) {
        case none:
            free(buffer);
            break;
        case managed:
        case cuda:
#ifdef _ENABLE_CUDA_
            cudaFree(buffer);
#endif
            break;
        case openacc:
#ifdef _ENABLE_OPENACC_
            acc_free(buffer);
#endif
            break;
    }

    /* Free dummy compute related resources */
    if (is_alloc) {
        if (options.target == cpu) {
            free_host_arrays();
        }
#ifdef _ENABLE_CUDA_KERNEL_
        else if (options.target == gpu || options.target == both) {
            free_host_arrays();
            free_device_arrays();
        }
#endif
    }

    is_alloc = 0;
}
void free_buffer (void * buffer, enum accel_type type)
{
    switch (type) {
        case none:
            free(buffer);
            break;
        case managed:
        case cuda:
#ifdef _ENABLE_CUDA_
            cudaFree(buffer);
#endif
            break;
        case openacc:
#ifdef _ENABLE_OPENACC_
            acc_free(buffer);
#endif
            break;
    }

    /* Free dummy compute related resources */
    if (cpu == options.target || both == options.target) {
        free_host_arrays();
    }

    if (gpu == options.target || both == options.target) {
#ifdef _ENABLE_CUDA_KERNEL_
        free_device_arrays();
#endif /* #ifdef _ENABLE_CUDA_KERNEL_ */
    }
}
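/*
 * Hedged sketch (not part of the original sources): both free_buffer()
 * variants above call a free_host_arrays() helper that is not shown here.
 * Assuming the host-side globals a (DIM x DIM), x, and y allocated in
 * init_arrays() below, a matching cleanup routine could look like this.
 */
static void free_host_arrays(void)
{
    int i;

    if (a != NULL) {
        for (i = 0; i < DIM; i++) {
            free(a[i]);
        }
        free(a);
        a = NULL;
    }

    free(x);
    free(y);
    x = NULL;
    y = NULL;
}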
/* Print code for clearing the device after execution of the transformed code.
 * In particular, free the memory that was allocated on the device.
 */
static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
    struct gpu_prog *prog)
{
    p = unbind_device_textures_surfaces(p, prog);
    p = free_cuda_array(p, prog);
    p = free_device_arrays(p, prog);

    return p;
}
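/*
 * Hedged sketch (an assumption, not the actual codegen implementation):
 * the free_device_arrays() printer used by clear_device() above would be
 * expected to emit one cudaFree() call per array that was allocated on the
 * device. The field names prog->n_array and prog->array[i].name are assumed
 * here for illustration.
 */
static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
    struct gpu_prog *prog)
{
    int i;

    for (i = 0; i < prog->n_array; ++i) {
        /* Emit a line of the form: cudaCheckReturn(cudaFree(dev_<name>)); */
        p = isl_printer_start_line(p);
        p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
        p = isl_printer_print_str(p, prog->array[i].name);
        p = isl_printer_print_str(p, "));");
        p = isl_printer_end_line(p);
    }

    return p;
}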
void allocate_device_arrays(int n)
{
    cudaError_t cuerr = cudaSuccess;

    /* First free the old arrays */
    free_device_arrays();

    /* Allocate device arrays for dummy compute */
    cuerr = cudaMalloc((void**)&d_x, n * sizeof(float));
    if (cuerr != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device array d_x\n");
    }

    cuerr = cudaMalloc((void**)&d_y, n * sizeof(float));
    if (cuerr != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device array d_y\n");
    }

    /* cudaMemset fills bytes, not floats, so cover the full allocation;
     * the exact contents do not matter for the dummy compute */
    cudaMemset(d_x, 1, n * sizeof(float));
    cudaMemset(d_y, 2, n * sizeof(float));

    is_alloc = 1;
}
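/*
 * Hedged sketch (an assumption, not shown in the original sources): the
 * free_device_arrays() helper called by allocate_device_arrays() above and
 * by free_buffer() would release the d_x/d_y buffers and clear is_alloc, so
 * that it is safe to call before every reallocation.
 */
void free_device_arrays(void)
{
    if (is_alloc) {
        cudaFree(d_x);
        cudaFree(d_y);
        d_x = NULL;
        d_y = NULL;
        is_alloc = 0;
    }
}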
void init_arrays(double target_time)
{
    if (DEBUG) {
        fprintf(stderr, "called init_arrays with target_time = %f usec\n",
                (target_time * 1e6));
    }

    int i = 0, j = 0;

    a = (float **)malloc(DIM * sizeof(float *));
    for (i = 0; i < DIM; i++) {
        a[i] = (float *)malloc(DIM * sizeof(float));
    }

    x = (float *)malloc(DIM * sizeof(float));
    y = (float *)malloc(DIM * sizeof(float));

    for (i = 0; i < DIM; i++) {
        x[i] = y[i] = 1.0f;
        for (j = 0; j < DIM; j++) {
            a[i][j] = 2.0f;
        }
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Starting size of the device arrays for dummy compute */
        int N = options.device_array_size;

        /* Device arrays for dummy compute */
        allocate_device_arrays(N);

        double t1 = 0.0, t2 = 0.0;

        /* Grow N until one kernel invocation takes at least target_time */
        while (1) {
            t1 = MPI_Wtime();

            cudaStreamCreate(&stream);
            call_kernel(A, d_x, d_y, N, &stream);
            cudaDeviceSynchronize();
            cudaStreamDestroy(stream);

            t2 = MPI_Wtime();

            if ((t2 - t1) < target_time) {
                N += 32;
                /* allocate_device_arrays() frees the old arrays before
                 * allocating arrays of the new size N */
                allocate_device_arrays(N);
            } else {
                break;
            }
        }

        /* We reach here with the desired N, so save it back to options */
        options.device_array_size = N;
        if (DEBUG) {
            fprintf(stderr, "correct N = %d\n", N);
        }
    }
#endif
}
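/*
 * Hedged sketch (an assumption): call_kernel() used in init_arrays() above
 * is not shown. A typical dummy compute is a DAXPY-style update
 * y = a * x + y launched on the provided stream; the kernel name and launch
 * geometry below are illustrative, not the original implementation.
 */
__global__ void dummy_compute_kernel(float a, float *d_x, float *d_y, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        d_y[idx] = a * d_x[idx] + d_y[idx];
    }
}

void call_kernel(float a, float *d_x, float *d_y, int n, cudaStream_t *stream)
{
    int threads = 128;
    int blocks = (n + threads - 1) / threads;

    dummy_compute_kernel<<<blocks, threads, 0, *stream>>>(a, d_x, d_y, n);
}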