Example #1
extern "C" void mixbenchGPU(cl_device_id dev_id, double *c, long size, bool block_strided, bool host_allocated, size_t workgroupsize, unsigned int elements_per_wi, unsigned int fusion_degree) {
    const char *benchtype;
    if(block_strided)
        benchtype = "Workgroup";
    else
        benchtype = "NDRange";
    printf("Workitem stride:        %s\n", benchtype);
    const char *buffer_allocation = host_allocated ? "Host allocated" : "Device allocated";
    printf("Buffer allocation:      %s\n", buffer_allocation);

    // Set context properties
    cl_platform_id p_id;
    OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL) );
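    // Check the device's extensions string for cl_khr_fp64 to decide whether double-precision kernels can be built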
    size_t length;
    OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length) );
    char *extensions = (char*)alloca(length);
    OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL) );
    bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL;

    cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)p_id, 0 };

    cl_int err;  // not "errno": errno is a macro in <cerrno>, so it cannot portably name a variable
    // Create context
    cl_context context = clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &err);
    OCL_SAFE_CALL(err);

    cl_mem_flags buf_flags = CL_MEM_READ_WRITE;
    if( host_allocated )
        buf_flags |= CL_MEM_ALLOC_HOST_PTR;
    cl_mem c_buffer = clCreateBuffer(context, buf_flags, size*sizeof(double), NULL, &err);
    OCL_SAFE_CALL(err);

    // Create command queue
    cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, CL_QUEUE_PROFILING_ENABLE, &err);
    OCL_SAFE_CALL(err);

    // Set data on device memory: map the buffer and zero it through the host pointer.
    // Mapping as cl_double (not cl_int) ensures the loop clears the entire size*sizeof(double) range.
    cl_double *mapped_data = (cl_double*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, size*sizeof(double), 0, NULL, NULL, &err);
    OCL_SAFE_CALL(err);
    for(long i=0; i<size; i++)
        mapped_data[i] = 0.0;
    OCL_SAFE_CALL( clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL) );

    // Load source, create program and all kernels
    printf("Loading kernel source file...\n");
    const char c_param_format_str[] = "-cl-std=CL1.1 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s %s";
    const char *c_empty = "";
    const char *c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty;
    const char *c_enable_dp = "-DENABLE_DP";
    char c_build_params[256];
    const char *c_kernel_source = ReadFile("mix_kernels_ro.cl");
    printf("Precompilation of kernels... ");
    sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, c_striding, c_empty);

    cl_kernel kernel_warmup = BuildKernel(context, dev_id, c_kernel_source, c_build_params);

    show_progress_init(compute_iterations_len);
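    // One precompiled kernel per data type (float/int/double) and per compute-iteration count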
    cl_kernel kernels[kdt_double+1][compute_iterations_len];
    for(int i=0; i<compute_iterations_len; i++) {
        show_progress_step(0, '\\');
        sprintf(c_build_params, c_param_format_str, "float", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty);
        //printf("%s\n",c_build_params);
        kernels[kdt_float][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params);

        show_progress_step(0, '|');
        sprintf(c_build_params, c_param_format_str, "int", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty);
        //printf("%s\n",c_build_params);
        kernels[kdt_int][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params);

        if( enable_dp ) {
            show_progress_step(0, '/');
            sprintf(c_build_params, c_param_format_str, "double", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_enable_dp);
            //printf("%s\n",c_build_params);
            kernels[kdt_double][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params);
        } else
            kernels[kdt_double][i] = 0;
        show_progress_step(1, '>');
    }
    show_progress_done();
    free((char*)c_kernel_source);

    runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize);

    // Synchronize in order to wait for memory operations to finish
    OCL_SAFE_CALL( clFinish(cmd_queue) );

    printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
    printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Integer operations,,, \n");
    printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");

    for(int i=0; i<compute_iterations_len; i++)
        runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, workgroupsize, elements_per_wi, fusion_degree);

    printf("------------------------------------------------------------------------------------------------------------------------------\n");

    // Copy results back to host memory
    OCL_SAFE_CALL( clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, size*sizeof(double), c, 0, NULL, NULL) );

    // Release kernels and program
    ReleaseKernelNProgram(kernel_warmup);
    for(int i=0; i<compute_iterations_len; i++) {
        ReleaseKernelNProgram(kernels[kdt_float][i]);
        ReleaseKernelNProgram(kernels[kdt_int][i]);
        if( enable_dp )
            ReleaseKernelNProgram(kernels[kdt_double][i]);
    }

    // Release buffer, command queue and context
    OCL_SAFE_CALL( clReleaseMemObject(c_buffer) );
    OCL_SAFE_CALL( clReleaseCommandQueue(cmd_queue) );
    OCL_SAFE_CALL( clReleaseContext(context) );
}
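
The example above depends on a few helpers it does not show: the OCL_SAFE_CALL error-checking macro and the ReadFile, BuildKernel and ReleaseKernelNProgram routines. Below is a minimal sketch of what they could look like, assuming the kernel entry point is named benchmark_func; it is illustrative, not the benchmark's actual implementation.

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

/* Abort with a diagnostic if an OpenCL call returns an error code. */
#define OCL_SAFE_CALL(call)                                         \
    do {                                                            \
        cl_int err_ = (call);                                       \
        if (err_ != CL_SUCCESS) {                                   \
            fprintf(stderr, "OpenCL error %d at %s:%d\n",           \
                    (int)err_, __FILE__, __LINE__);                 \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while(0)

/* Read a whole source file into a NUL-terminated heap buffer (caller frees). */
static char* ReadFile(const char *fname) {
    FILE *f = fopen(fname, "rb");
    if (!f) { fprintf(stderr, "Cannot open %s\n", fname); exit(EXIT_FAILURE); }
    fseek(f, 0, SEEK_END);
    long len = ftell(f);
    fseek(f, 0, SEEK_SET);
    char *buf = (char*)malloc(len + 1);
    if (fread(buf, 1, (size_t)len, f) != (size_t)len) exit(EXIT_FAILURE);
    buf[len] = '\0';
    fclose(f);
    return buf;
}

/* Compile the source with the given options and return one kernel object;
   the entry-point name "benchmark_func" is an assumption. */
static cl_kernel BuildKernel(cl_context ctx, cl_device_id dev,
                             const char *src, const char *options) {
    cl_int err;
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    OCL_SAFE_CALL(err);
    if (clBuildProgram(prog, 1, &dev, options, NULL, NULL) != CL_SUCCESS) {
        size_t log_len;
        clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_len);
        char *log = (char*)malloc(log_len);
        clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, log_len, log, NULL);
        fprintf(stderr, "Build failed:\n%s\n", log);
        exit(EXIT_FAILURE);
    }
    cl_kernel krn = clCreateKernel(prog, "benchmark_func", &err);
    OCL_SAFE_CALL(err);
    return krn;
}

/* Release a kernel together with the program it was created from. */
static void ReleaseKernelNProgram(cl_kernel krn) {
    cl_program prog;
    OCL_SAFE_CALL( clGetKernelInfo(krn, CL_KERNEL_PROGRAM, sizeof(prog), &prog, NULL) );
    OCL_SAFE_CALL( clReleaseKernel(krn) );
    OCL_SAFE_CALL( clReleaseProgram(prog) );
}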
Example #2
extern "C" void mixbenchGPU(double *c, long size){
	const char *benchtype = "compute with global memory (block strided)";
	printf("Trade-off type:       %s\n", benchtype);
	double *cd;

	CUDA_SAFE_CALL( hipMalloc((void**)&cd, size*sizeof(double)) );

	// Copy data to device memory
	CUDA_SAFE_CALL( hipMemset(cd, 0, size*sizeof(double)) );  // initialize to zeros

	// Synchronize in order to wait for memory operations to finish
	CUDA_SAFE_CALL( hipDeviceSynchronize() );

	printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
	printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Integer operations,,, \n");
	printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");

	runbench_warmup(cd, size);

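	// runbench's compute-iteration count is a template parameter (a compile-time
	// constant), so each value needs its own explicit instantiation: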
	runbench<32>(cd, size);
	runbench<31>(cd, size);
	runbench<30>(cd, size);
	runbench<29>(cd, size);
	runbench<28>(cd, size);
	runbench<27>(cd, size);
	runbench<26>(cd, size);
	runbench<25>(cd, size);
	runbench<24>(cd, size);
	runbench<23>(cd, size);
	runbench<22>(cd, size);
	runbench<21>(cd, size);
	runbench<20>(cd, size);
	runbench<19>(cd, size);
	runbench<18>(cd, size);
	runbench<17>(cd, size);
	runbench<16>(cd, size);
	runbench<15>(cd, size);
	runbench<14>(cd, size);
	runbench<13>(cd, size);
	runbench<12>(cd, size);
	runbench<11>(cd, size);
	runbench<10>(cd, size);
	runbench<9>(cd, size);
	runbench<8>(cd, size);
	runbench<7>(cd, size);
	runbench<6>(cd, size);
	runbench<5>(cd, size);
	runbench<4>(cd, size);
	runbench<3>(cd, size);
	runbench<2>(cd, size);
	runbench<1>(cd, size);
	runbench<0>(cd, size);

	printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");

	// Copy results back to host memory
	CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) );

	CUDA_SAFE_CALL( hipFree(cd) );

	CUDA_SAFE_CALL( hipDeviceReset() );
}
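
Example #2 likewise assumes a CUDA_SAFE_CALL macro and a runbench function template whose compute-iteration count is a compile-time parameter. A minimal sketch under those assumptions follows; the kernel body is placeholder arithmetic, not the benchmark's real instruction mix.

#include <cstdio>
#include <cstdlib>
#include <hip/hip_runtime.h>

/* Abort with a diagnostic if a HIP call fails. */
#define CUDA_SAFE_CALL(call)                                        \
	do {                                                            \
		hipError_t err_ = (call);                                   \
		if (err_ != hipSuccess) {                                   \
			fprintf(stderr, "HIP error '%s' at %s:%d\n",            \
			        hipGetErrorString(err_), __FILE__, __LINE__);   \
			exit(EXIT_FAILURE);                                     \
		}                                                           \
	} while(0)

// Making the iteration count a template parameter turns it into a
// compile-time constant, so the compiler can fully unroll the compute
// loop; that is why the caller enumerates runbench<0>..runbench<32>.
template <unsigned int compute_iterations>
__global__ void benchmark_kernel(double *c) {
	long idx = blockIdx.x * blockDim.x + threadIdx.x;
	double v = c[idx];
	#pragma unroll
	for (unsigned int j = 0; j < compute_iterations; j++)
		v = v * v + 0.01;  // placeholder arithmetic
	c[idx] = v;
}

template <unsigned int compute_iterations>
void runbench(double *cd, long size) {
	dim3 block(256);
	dim3 grid((unsigned int)(size / 256));  // assumes size is a multiple of 256
	hipLaunchKernelGGL(benchmark_kernel<compute_iterations>, grid, block, 0, 0, cd);
	CUDA_SAFE_CALL( hipDeviceSynchronize() );
}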