示例#1
0
/*
 * Calibrate the size of the dummy-compute device arrays.
 *
 * target_time is compared directly against MPI_Wtime() differences, so it is
 * expressed in the same unit as MPI_Wtime() (seconds).
 *
 * The calibration only runs when the file is built with
 * _ENABLE_CUDA_KERNEL_ and options.target is gpu or both.  Starting from
 * options.device_array_size, the array size N is grown in steps of 32 until
 * one timed round (stream creation, kernel launch, device synchronization and
 * stream destruction) is no longer shorter than target_time.  The resulting N
 * is written back to options.device_array_size.
 */
void
init_arrays(double target_time)
{

    if (DEBUG) fprintf(stderr, "called init_arrays with target_time = %f \n",
            (target_time * 1e6));

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Setting size of arrays for Dummy Compute */
        int N = options.device_array_size;
        double t1 = 0.0, t2 = 0.0;

        /* Device Arrays for Dummy Compute */
        allocate_device_arrays(N);

        while (1) {
            t1 = MPI_Wtime();

            /* The enclosing branch already guarantees a GPU target, so the
             * kernel round is executed unconditionally here. */
            cudaStreamCreate(&stream);
            call_kernel(A, d_x, d_y, N, &stream);

            cudaDeviceSynchronize();
            cudaStreamDestroy(stream);

            t2 = MPI_Wtime();
            if ((t2 - t1) < target_time) {
                N += 32;

                /* First free the old arrays; without this every growth step
                 * would leave the previous device buffers allocated. */
                free_device_arrays();

                /* Now allocate arrays of size N */
                allocate_device_arrays(N);
            }
            else {
                break;
            }
        }

        /* we reach here with desired N so save it and pass it to options */
        options.device_array_size = N;
        if (DEBUG) fprintf(stderr, "correct N = %d\n", N);
    }
#endif

}
示例#2
0
/* Emit the code that prepares the device for running the transformed
 * program: the CUDA helper macros, the declarations of locally defined
 * variables, and the declaration and allocation of the device copies of
 * the arrays, followed by the texture/surface bindings.
 *
 * Takes ownership of "p" and returns the printer produced by the last step.
 */
static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
	struct gpu_prog *prog)
{
	/* Macros first, then the local declarations. */
	p = gpu_print_local_declarations(print_cuda_macros(p), prog);

	/* Device arrays are declared before they are allocated. */
	p = allocate_device_arrays(declare_device_arrays(p, prog), prog);

	/* Binding of textures and surfaces is the final step. */
	return bind_device_textures_surfaces(p, prog);
}
示例#3
0
/*
 * Allocate and initialize the host arrays used for the dummy compute and,
 * when a GPU target is selected, calibrate the device array size.
 *
 * Host side: "a" becomes a DIM x DIM matrix of rows filled with 2.0f, and
 * "x" and "y" become DIM-element vectors filled with 1.0f.  If any host
 * allocation fails, an error is printed to stderr and the process exits with
 * EXIT_FAILURE instead of dereferencing a NULL pointer in the fill loop.
 *
 * Device side (only with _ENABLE_CUDA_KERNEL_ and options.target equal to gpu
 * or both): starting from options.device_array_size, N is grown in steps of
 * 32 until one timed round (stream creation, kernel launch, device
 * synchronization and stream destruction) is no longer shorter than
 * target_time.  target_time is compared directly against MPI_Wtime()
 * differences, so it is in the same unit (seconds).  The resulting N is
 * written back to options.device_array_size.
 */
void 
init_arrays(double target_time) 
{
    
    if (DEBUG) fprintf(stderr, "called init_arrays with target_time = %f \n", (target_time * 1e6));
    int i = 0, j = 0;
    
    a = (float **)malloc(DIM * sizeof(float *));
    if (a == NULL) {
        fprintf(stderr, "init_arrays: unable to allocate row table for a\n");
        exit(EXIT_FAILURE);
    }
    
    for (i = 0; i < DIM; i++) {
        a[i] = (float *)malloc(DIM * sizeof(float));
        if (a[i] == NULL) {
            fprintf(stderr, "init_arrays: unable to allocate row %d of a\n", i);
            exit(EXIT_FAILURE);
        }
    }
    
    x = (float *)malloc(DIM * sizeof(float));
    y = (float *)malloc(DIM * sizeof(float));
    if (x == NULL || y == NULL) {
        fprintf(stderr, "init_arrays: unable to allocate vectors x and y\n");
        exit(EXIT_FAILURE);
    }

    for (i = 0; i < DIM; i++) {
        x[i] = y[i] = 1.0f;
        for (j = 0; j < DIM; j++) {
            a[i][j] = 2.0f;
        }
    }

#ifdef _ENABLE_CUDA_KERNEL_
    if (options.target == gpu || options.target == both) {
        /* Setting size of arrays for Dummy Compute */
        int N = options.device_array_size;
        double t1 = 0.0, t2 = 0.0;

        /* Device Arrays for Dummy Compute */
        allocate_device_arrays(N);

        while (1) {
            t1 = MPI_Wtime();

            /* The enclosing branch already guarantees a GPU target, so the
             * kernel round is executed unconditionally here. */
            cudaStreamCreate(&stream);
            call_kernel(A, d_x, d_y, N, &stream);

            cudaDeviceSynchronize();
            cudaStreamDestroy(stream);

            t2 = MPI_Wtime();
            if ((t2 - t1) < target_time) {
                N += 32;

                /* First free the old arrays */
                free_device_arrays();

                /* Now allocate arrays of size N */
                allocate_device_arrays(N);
            }
            else {
                break;
            }
        }

        /* we reach here with desired N so save it and pass it to options */
        options.device_array_size = N;
        if (DEBUG) fprintf(stderr, "correct N = %d\n", N);
    }
#endif

}