int main(int argc, char **argv) { cl_uint num; cl_int err; int platform_idx = -1; cl_platform_id *plat_ids; int i; size_t sz; cl_device_id *gpu_devs; cl_context_properties cps[3]; cl_context context; int opt; char *input; int run_size = 1024; struct AIISA_Program prog; cl_command_queue queue; int ei; int nloop = 16; struct AIISA_CodeBuffer buf; aiisa_code_buffer_init(&buf); clGetPlatformIDs(0, NULL, &num); plat_ids = (cl_platform_id*)malloc(sizeof(*plat_ids) * num); clGetPlatformIDs(num, plat_ids, NULL); while ((opt = getopt(argc, argv, "n:")) != -1) { switch (opt) { case 'n': run_size = atoi(optarg); break; default: puts("usage : run in.cl"); return 1; } } if (optind >= argc) { puts("usage : run in.cl"); return 1; } input = argv[optind]; for (i=0; i<(int)num; i++) { char name[1024]; size_t len; clGetPlatformInfo(plat_ids[i], CL_PLATFORM_VENDOR, sizeof(name), name, &len); //puts(name); if (strcmp(name, "Advanced Micro Devices, Inc.") == 0) { platform_idx = i; break; } } if (platform_idx == -1) { puts("no amd"); return -1; } clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &num); if (num == 0) { puts("no gpu"); return -1; } gpu_devs = (cl_device_id*)malloc(sizeof(gpu_devs[0]) * 1); //clGetDeviceIDs(plat_ids[platform_idx], CL_DEVICE_TYPE_GPU, num, gpu_devs, NULL); cps[0] = CL_CONTEXT_PLATFORM; cps[1] = (cl_context_properties)plat_ids[platform_idx]; cps[2] = 0; context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &err); clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(gpu_devs), gpu_devs, &sz); queue = clCreateCommandQueue(context, gpu_devs[0], 0, NULL); { char name[1024]; size_t sz; clGetDeviceInfo(gpu_devs[0], CL_DEVICE_NAME, sizeof(name), name, &sz); puts(name); } //puts(input); aiisa_build_binary_from_cl(&prog, context, gpu_devs[0], input); for (ei=0; ei<nloop; ei++) { cl_program cl_prog; const unsigned char *bin[1]; size_t bin_size[1]; cl_kernel ker; cl_mem in, out; size_t global_size[3]; double tb, te; tb = sec(); gen_code(&prog, &buf); bin[0] = prog.cl_binary; bin_size[0] = prog.size; cl_prog = clCreateProgramWithBinary(context, 1, gpu_devs, bin_size, bin, NULL, NULL); clBuildProgram(cl_prog, 1, gpu_devs, NULL, NULL, NULL); ker = clCreateKernel(cl_prog, "f", &err); te = sec(); printf("build : %f[usec]\n", (te-tb)*1000000); in = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err); out = clCreateBuffer(context, CL_MEM_READ_WRITE, run_size * sizeof(int), NULL, &err); clSetKernelArg(ker, 0, sizeof(cl_mem), &in); clSetKernelArg(ker, 1, sizeof(cl_mem), &out); { int *ptr = (int*)clEnqueueMapBuffer(queue, in, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { ptr[i] = i; } clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL); } { int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_WRITE, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { ptr[i] = 0xdeadbeef; } clEnqueueUnmapMemObject(queue, out, ptr, 0, NULL, NULL); } err = clFinish(queue); global_size[0] = run_size; err = clEnqueueNDRangeKernel(queue, ker, 1, NULL, global_size, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { puts("enqueue nd"); } err = clFinish(queue); if (err != CL_SUCCESS) { puts("fini"); } if (ei == 0) { int *ptr = (int*)clEnqueueMapBuffer(queue, out, CL_TRUE, CL_MAP_READ, 0, run_size*sizeof(int), 0, NULL, NULL, NULL); int i; for (i=0; i<run_size; i++) { printf("%d : %x\n", i, ptr[i]); } clEnqueueUnmapMemObject(queue, in, ptr, 0, NULL, NULL); } err = clFinish(queue); clReleaseMemObject(in); clReleaseMemObject(out); clReleaseKernel(ker); clReleaseProgram(cl_prog); } return 0; }
void cape::load(int team) { saved_team = team; hit_floor = false; death_height_offset = 0.f; model = cpu_context->make_new(); model->set_load_func(std::bind(cape::load_cape, std::placeholders::_1, team)); model->set_active(true); model->cache = false; ///why? //model->set_normal("res/norm_body.png"); //obj_mem_manager::load_active_objects(); cpu_context->load_active(); model->set_two_sided(true); model->set_specular(0.7); //obj_mem_manager::g_arrange_mem(); //obj_mem_manager::g_changeover(); cpu_context->build(); gpu_context = cpu_context->fetch(); which = 0; in = compute::buffer(cl::context, sizeof(float)*width*height*depth*3, CL_MEM_READ_WRITE, nullptr); out = compute::buffer(cl::context, sizeof(float)*width*height*depth*3, CL_MEM_READ_WRITE, nullptr); cl_float* inmap = (cl_float*) clEnqueueMapBuffer(cl::cqueue.get(), in.get(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*width*height*depth*3, 0, NULL, NULL, NULL); cl_float* outmap = (cl_float*) clEnqueueMapBuffer(cl::cqueue.get(), out.get(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_float)*width*height*depth*3, 0, NULL, NULL, NULL); const float separation = 10.f; for(int j=0; j<height; j++) { for(int i=0; i<width; i++) { float xpos = i * separation; float ypos = j * separation; float zpos = 0; inmap[(i + j*width)*3 + 0] = xpos; inmap[(i + j*width)*3 + 1] = ypos; inmap[(i + j*width)*3 + 2] = zpos; outmap[(i + j*width)*3 + 0] = xpos; outmap[(i + j*width)*3 + 1] = ypos; outmap[(i + j*width)*3 + 2] = zpos; } } clEnqueueUnmapMemObject(cl::cqueue.get(), in.get(), inmap, 0, NULL, NULL); clEnqueueUnmapMemObject(cl::cqueue.get(), out.get(), outmap, 0, NULL, NULL); loaded = true; context_id = cpu_context->get_context_id(); }
int main(int argc, char *argv[]) { int myid, numprocs, i, j; int size, align_size; // host buffer char *s_buf, *r_buf, *s_buf1, *r_buf1; double t_start = 0.0, t_end = 0.0, t = 0.0; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myid); align_size = getpagesize(); assert(align_size <= MAX_ALIGNMENT); #ifdef PINNED // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); err_status(ret); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); err_status(ret); printf("%d device(s) in %d platform(s)\n",ret_num_devices, ret_num_platforms); char cBuffer[1024]; ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); err_status(ret); printf("CL_DEVICE_NAME: %s\n", cBuffer); // Create an OpenCL context cl_context context = clCreateContext (NULL, 1, &device_id, NULL, NULL, &ret); err_status(ret); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue (context, device_id, 0, &ret); err_status(ret); // Create memory buffers on the device cl_mem s_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, // CL_MEM_COPY_HOST_PTR is only valid with non-NULL pointer MYBUFSIZE, NULL, &ret); err_status(ret); cl_mem r_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, // CL_MEM_COPY_HOST_PTR is only valid with non-NULL pointer MYBUFSIZE, NULL, &ret); err_status(ret); // pinned memory (blocked call) s_buf1 = (char *) clEnqueueMapBuffer(command_queue, s_mem, CL_TRUE, CL_MAP_WRITE, 0, MYBUFSIZE, 0, NULL, NULL, &ret); err_status(ret); r_buf1 = (char *) clEnqueueMapBuffer(command_queue, r_mem, CL_TRUE, CL_MAP_WRITE, 0, MYBUFSIZE, 0, NULL, NULL, &ret); err_status(ret); #else if (myid == 0) printf("# Using PAGEABLE host memory!\n"); s_buf1 = (char*) malloc(MYBUFSIZE); r_buf1 = (char*) malloc(MYBUFSIZE); #endif s_buf = (char *) (((unsigned long) s_buf1 + (align_size - 1)) / align_size * align_size); r_buf = (char *) (((unsigned long) r_buf1 + (align_size - 1)) / align_size * align_size); if(numprocs != 2) { if(myid == 0) { fprintf(stderr, "This test requires exactly two processes\n"); } MPI_Finalize(); return EXIT_FAILURE; } if(myid == 0) { fprintf(stdout, "# %s\n", BENCHMARK); fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH, "Bandwidth (MB/s)"); fflush(stdout); } /* Bandwidth test */ for(size = 1; size <= MAX_MSG_SIZE; size *= 2) { /* touch the data */ for(i = 0; i < size; i++) { s_buf[i] = 'a'; r_buf[i] = 'b'; } // puts("2"); if(size > large_message_size) { loop = loop_large; skip = skip_large; window_size = window_size_large; } if(myid == 0) { for(i = 0; i < loop + skip; i++) { if(i == skip) { t_start = MPI_Wtime(); } for(j = 0; j < window_size; j++) { MPI_Isend(s_buf, size, MPI_CHAR, 1, 100, MPI_COMM_WORLD, request + j); } MPI_Waitall(window_size, request, reqstat); MPI_Recv(r_buf, 4, MPI_CHAR, 1, 101, MPI_COMM_WORLD, &reqstat[0]); } t_end = MPI_Wtime(); // printf("%d %d\n",myid,size); t = t_end - t_start; } else if(myid == 1) { for(i = 0; i < loop + skip; i++) { for(j = 0; j < window_size; j++) { MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD, request + j); } MPI_Waitall(window_size, request, reqstat); MPI_Send(s_buf, 4, MPI_CHAR, 0, 101, MPI_COMM_WORLD); } // printf("%d %d\n",myid,size); } if(myid == 0) { double tmp = size / 1e6 * loop * window_size; fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH, FLOAT_PRECISION, tmp / t); fflush(stdout); } } #ifdef PINNED // cudaFree(s_buf1); // cudaFree(r_buf1); // clReleaseMemObject(s_mem); // clReleaseMemObject(r_mem); #else free(s_buf1); free(r_buf1); #endif MPI_Finalize(); return EXIT_SUCCESS; }
extern "C" magma_err_t magma_dpotrf_msub(int num_subs, int num_gpus, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr *d_lA, size_t dA_offset, magma_int_t ldda, magma_int_t *info, magma_queue_t *queues) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix dA. The factorization has the form dA = U**T * U, if UPLO = 'U', or dA = L * L**T, if UPLO = 'L', where U is an upper triangular matrix and L is lower triangular. This is the block version of the algorithm, calling Level 3 BLAS. Arguments ========= UPLO (input) CHARACTER*1 = 'U': Upper triangle of dA is stored; = 'L': Lower triangle of dA is stored. N (input) INTEGER The order of the matrix dA. N >= 0. dA (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N) On entry, the symmetric matrix dA. If UPLO = 'U', the leading N-by-N upper triangular part of dA contains the upper triangular part of the matrix dA, and the strictly lower triangular part of dA is not referenced. If UPLO = 'L', the leading N-by-N lower triangular part of dA contains the lower triangular part of the matrix dA, and the strictly upper triangular part of dA is not referenced. On exit, if INFO = 0, the factor U or L from the Cholesky factorization dA = U**T * U or dA = L * L**T. LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,N). To benefit from coalescent memory accesses LDDA must be dividable by 16. INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value > 0: if INFO = i, the leading minor of order i is not positive definite, and the factorization could not be completed. ===================================================================== */ int tot_subs = num_subs * num_gpus; magma_err_t err; magma_int_t j, nb, d, lddp, h; double *work; magmaDouble_ptr dwork[MagmaMaxGPUs]; *info = 0; nb = magma_get_dpotrf_nb(n); if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (uplo != MagmaUpper) { lddp = nb*(n/(nb*tot_subs)); if( n%(nb*tot_subs) != 0 ) lddp+=min(nb,n-tot_subs*lddp); if( ldda < lddp ) *info = -4; } else if( ldda < n ) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } if (num_gpus == 1 && ((nb <= 1) || (nb >= n)) ) { /* Use unblocked code. */ err = magma_dmalloc_cpu( &work, n*nb ); if (err != MAGMA_SUCCESS) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } magma_dgetmatrix( n, n, d_lA[0], 0, ldda, work, 0, n, queues[0] ); lapackf77_dpotrf(lapack_uplo_const(uplo), &n, work, &n, info); magma_dsetmatrix( n, n, work, 0, n, d_lA[0], 0, ldda, queues[0] ); magma_free_cpu( work ); } else { lddp = 32*((n+31)/32); for (d=0; d<num_gpus; d++) { if (MAGMA_SUCCESS != magma_dmalloc( &dwork[d], num_gpus*nb*lddp )) { for( j=0; j<d; j++ ) magma_free( dwork[j] ); *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } } h = 1; //num_gpus; //(n+nb-1)/nb; #ifdef USE_PINNED_CLMEMORY cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(double)*n*nb*h, NULL, NULL); for (d=0; d<num_gpus; d++) { work = (double*)clEnqueueMapBuffer(queues[2*d], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(double)*n*nb*h, 0, NULL, NULL, NULL); } #else if (MAGMA_SUCCESS != magma_dmalloc_cpu( &work, n*nb*h )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } #endif if (uplo == MagmaUpper) { /* with two queues for each device */ magma_dpotrf2_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, dwork, lddp, work, n, h, info, queues); //magma_dpotrf3_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, // dwork, lddp, work, n, h, info, queues); /* with three streams */ //magma_dpotrf3_msub(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, n, // h, stream, event, info); } else { /* with two queues for each device */ magma_dpotrf2_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, dwork, lddp, work, nb*h, h, info, queues); //magma_dpotrf3_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, // dwork, lddp, work, nb*h, h, info, queues); //magma_dpotrf4_msub(num_subs, num_gpus, uplo, n, n, 0, 0, nb, d_lA, 0, ldda, // dwork, lddp, work, nb*h, h, info, queues); /* with three streams */ //magma_dpotrf3_msub(num_gpus, uplo, n, n, 0, 0, nb, d_lA, ldda, dwork, lddp, work, nb*h, // h, stream, event, info); } /* clean up */ for (d=0; d<num_gpus; d++) magma_free( dwork[d] ); #ifdef USE_PINNED_CLMEMORY for (d=0; d<num_gpus; d++) { clEnqueueUnmapMemObject(queues[2*d], buffer, work, 0, NULL, NULL); } clReleaseMemObject( buffer ); #else magma_free_cpu( work ); #endif } /* end of not lapack */ return *info; } /* magma_dpotrf_msub */
extern "C" void mixbenchGPU(cl_device_id dev_id, double *c, long size, bool block_strided, bool host_allocated, size_t workgroupsize, unsigned int elements_per_wi, unsigned int fusion_degree) { const char *benchtype; if(block_strided) benchtype = "Workgroup"; else benchtype = "NDRange"; printf("Workitem stride: %s\n", benchtype); const char *buffer_allocation = host_allocated ? "Host allocated" : "Device allocated"; printf("Buffer allocation: %s\n", buffer_allocation); // Set context properties cl_platform_id p_id; OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL) ); size_t length; OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length) ); char *extensions = (char*)alloca(length); OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL) ); bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL; cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)p_id, 0 }; cl_int errno; // Create context cl_context context = clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &errno); OCL_SAFE_CALL(errno); cl_mem_flags buf_flags = CL_MEM_READ_WRITE; if( host_allocated ) buf_flags |= CL_MEM_ALLOC_HOST_PTR; cl_mem c_buffer = clCreateBuffer(context, buf_flags, size*sizeof(double), NULL, &errno); OCL_SAFE_CALL(errno); // Create command queue cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, CL_QUEUE_PROFILING_ENABLE, &errno); OCL_SAFE_CALL(errno); // Set data on device memory cl_int *mapped_data = (cl_int*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, size*sizeof(double), 0, NULL, NULL, &errno); OCL_SAFE_CALL(errno); for(int i=0; i<size; i++) mapped_data[i] = 0; clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL); // Load source, create program and all kernels printf("Loading kernel source file...\n"); const char c_param_format_str[] = "-cl-std=CL1.1 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s %s"; const char *c_empty = ""; const char *c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty; const char *c_enable_dp = "-DENABLE_DP"; char c_build_params[256]; const char *c_kernel_source = {ReadFile("mix_kernels_ro.cl")}; printf("Precompilation of kernels... "); sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, c_striding, c_empty); cl_kernel kernel_warmup = BuildKernel(context, dev_id, c_kernel_source, c_build_params); show_progress_init(compute_iterations_len); cl_kernel kernels[kdt_double+1][compute_iterations_len]; for(int i=0; i<compute_iterations_len; i++) { show_progress_step(0, '\\'); sprintf(c_build_params, c_param_format_str, "float", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty); //printf("%s\n",c_build_params); kernels[kdt_float][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); show_progress_step(0, '|'); sprintf(c_build_params, c_param_format_str, "int", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty); //printf("%s\n",c_build_params); kernels[kdt_int][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); if( enable_dp ) { show_progress_step(0, '/'); sprintf(c_build_params, c_param_format_str, "double", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_enable_dp); //printf("%s\n",c_build_params); kernels[kdt_double][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); } else kernels[kdt_double][i] = 0; show_progress_step(1, '>'); } show_progress_done(); free((char*)c_kernel_source); runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize); // Synchronize in order to wait for memory operations to finish OCL_SAFE_CALL( clFinish(cmd_queue) ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); for(int i=0; i<compute_iterations_len; i++) runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, workgroupsize, elements_per_wi, fusion_degree); printf("------------------------------------------------------------------------------------------------------------------------------\n"); // Copy results back to host memory OCL_SAFE_CALL( clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, size*sizeof(double), c, 0, NULL, NULL) ); // Release kernels and program ReleaseKernelNProgram(kernel_warmup); for(int i=0; i<compute_iterations_len; i++) { ReleaseKernelNProgram(kernels[kdt_float][i]); ReleaseKernelNProgram(kernels[kdt_int][i]); if( enable_dp ) ReleaseKernelNProgram(kernels[kdt_double][i]); } // Release buffer OCL_SAFE_CALL( clReleaseMemObject(c_buffer) ); }
int BinomialOption::runCLKernels() { cl_int status; cl_event ndrEvt; cl_int eventStatus = CL_QUEUED; cl_event inMapEvt; void* mapPtr = clEnqueueMapBuffer(commandQueue, randBuffer, CL_FALSE, CL_MAP_WRITE, 0, numSamples * sizeof(cl_float4), 0, NULL, &inMapEvt, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer failed. (inputBuffer)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = sampleCommon->waitForEventAndRelease(&inMapEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt) Failed"); memcpy(mapPtr, randArray, numSamples * sizeof(cl_float4)); cl_event inUnmapEvent; status = clEnqueueUnmapMemObject(commandQueue, randBuffer, mapPtr, 0, NULL, &inUnmapEvent); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject failed. (randBuffer)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = sampleCommon->waitForEventAndRelease(&inUnmapEvent); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inUnmapEvent) Failed"); // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&numSteps); CHECK_OPENCL_ERROR(status, "clSetKernelArg(numSteps) failed."); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&randBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuffer) failed."); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&outBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg(outBuffer) failed."); status = clSetKernelArg(kernel, 3, (numSteps + 1) * sizeof(cl_float4), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(callA) failed."); status = clSetKernelArg(kernel, 4, numSteps * sizeof(cl_float4), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(callB) failed."); // Enqueue a kernel run call. size_t globalThreads[] = {numSamples * (numSteps + 1)}; size_t localThreads[] = {numSteps + 1}; if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] || localThreads[0] > deviceInfo.maxWorkGroupSize) { std::cout << "Unsupported: Device does not support" "requested number of work items."; return SDK_FAILURE; } if(kernelInfo.localMemoryUsed > deviceInfo.localMemSize) { std::cout << "Unsupported: Insufficient local memory on device." << std::endl; return SDK_FAILURE; } /** * This algorithm reduces each group of work-items to a single value * on OpenCL device */ status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); cl_event outMapEvt; cl_uint* outMapPtr = (cl_uint*)clEnqueueMapBuffer(commandQueue, outBuffer, CL_FALSE, CL_MAP_READ, 0, numSamples * sizeof(cl_float4), 0, NULL, &outMapEvt, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(outputBuffer) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = sampleCommon->waitForEventAndRelease(&outMapEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapEvt) Failed"); memcpy(output, outMapPtr, numSamples * sizeof(cl_float4)); cl_event outUnmapEvt; status = clEnqueueUnmapMemObject(commandQueue, outBuffer, (void*)outMapPtr, 0, NULL, &outUnmapEvt); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(outputBuffer) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed."); status = sampleCommon->waitForEventAndRelease(&outUnmapEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outUnmapEvt) Failed"); return SDK_SUCCESS; }
int main(int argc, char *argv[]) { // selected platform and device number cl_uint pn = 0, dn = 0; // OpenCL error cl_int error; // generic iterator cl_uint i; // major/minor version of the platform OpenCL version cl_uint ocl_major, ocl_minor; // set platform/device num from command line if (argc > 1) pn = atoi(argv[1]); if (argc > 2) dn = atoi(argv[2]); error = clGetPlatformIDs(0, NULL, &np); CHECK_ERROR("getting amount of platform IDs"); printf("%u platforms found\n", np); if (pn >= np) { fprintf(stderr, "there is no platform #%u\n" , pn); exit(1); } // only allocate for IDs up to the intended one platform = calloc(pn+1,sizeof(*platform)); // if allocation failed, next call will bomb. rely on this error = clGetPlatformIDs(pn+1, platform, NULL); CHECK_ERROR("getting platform IDs"); // choose platform p = platform[pn]; error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform name"); printf("using platform %u: %s\n", pn, strbuf); error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform version"); // we need 1.2 at least i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor); if (i != 2) { fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n", __func__, __LINE__); exit(1); } if (ocl_major == 1 && ocl_minor < 2) { fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n", __func__, __LINE__, strbuf); exit(1); } error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); CHECK_ERROR("getting amount of device IDs"); printf("%u devices found\n", nd); if (dn >= nd) { fprintf(stderr, "there is no device #%u\n", dn); exit(1); } // only allocate for IDs up to the intended one device = calloc(dn+1,sizeof(*device)); // if allocation failed, next call will bomb. rely on this error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL); CHECK_ERROR("getting device IDs"); // choose device d = device[dn]; error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting device name"); printf("using device %u: %s\n", dn, strbuf); error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmem), &gmem, NULL); CHECK_ERROR("getting device global memory size"); error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_max), &alloc_max, NULL); CHECK_ERROR("getting device max memory allocation size"); // create context ctx_prop[1] = (cl_context_properties)p; ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error); CHECK_ERROR("creating context"); // create queue q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error); CHECK_ERROR("creating queue"); // create program pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error); CHECK_ERROR("creating program"); // build program error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL); CHECK_ERROR("building program"); // get kernel k = clCreateKernel(pg, "add", &error); CHECK_ERROR("creating kernel"); error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(wgm), &wgm, NULL); CHECK_ERROR("getting preferred workgroup size multiple"); // number of elements on which kernel will be launched. it's ok if we don't // cover every byte of the buffers nels = alloc_max/sizeof(cl_float); gws = ROUND_MUL(nels, wgm); printf("will use %zu workitems grouped by %zu to process %u elements\n", gws, wgm, nels); // we will try and allocate at least one buffer more than needed to fill // the device memory, and no less than 3 anyway nbuf = gmem/alloc_max + 1; if (nbuf < 3) nbuf = 3; #define MB (1024*1024.0) printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n", nbuf, alloc_max/MB, gmem/MB); hostbuf = calloc(nbuf, sizeof(cl_mem)); if (!hostbuf) { fprintf(stderr, "could not prepare support for %u buffers\n", nbuf); exit(1); } // allocate ‘host’ buffers for (i = 0; i < nbuf; ++i) { hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max, NULL, &error); CHECK_ERROR("allocating host buffer"); printf("host buffer %u allocated\n", i); error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i, CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, 0, NULL, NULL); CHECK_ERROR("migrating buffer to host"); printf("buffer %u migrated to host\n", i); } // allocate ‘device’ buffers for (i = 0; i < 2; ++i) { devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max, NULL, &error); CHECK_ERROR("allocating devbuffer"); printf("dev buffer %u allocated\n", i); if (i == 0) { float patt = 0; error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt), 0, nels*sizeof(patt), 0, NULL, &mem_evt); CHECK_ERROR("enqueueing memset"); } } error = clWaitForEvents(1, &mem_evt); CHECK_ERROR("waiting for buffer fill"); clReleaseEvent(mem_evt); mem_evt = NULL; // use the buffers for (i = 0; i < nbuf; ++i) { printf("testing buffer %u\n", i); // for each buffer, we do a setup on CPU and then use it as second // argument for the kernel hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, alloc_max, 0, NULL, NULL, &error); CHECK_ERROR("mapping buffer"); for (e = 0; e < nels; ++e) hbuf[e] = i; error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer"); hbuf = NULL; // copy ‘host’ to ‘device’ buffer clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max, 0, NULL, NULL); // make sure all pending actions are completed error = clFinish(q); CHECK_ERROR("settling down"); clSetKernelArg(k, 0, sizeof(cl_mem), devbuf); clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1); clSetKernelArg(k, 2, sizeof(nels), &nels); error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm, 0, NULL, &krn_evt); CHECK_ERROR("enqueueing kernel"); error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0], 0, 0, alloc_max, 1, &krn_evt, &mem_evt); CHECK_ERROR("copying data to host"); expected = i*(i+1)/2.0f; hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ, 0, alloc_max, 1, &mem_evt, NULL, &error); CHECK_ERROR("mapping buffer 0"); for (e = 0; e < nels; ++e) if (hbuf[e] != expected) { fprintf(stderr, "mismatch @ %u: %g instead of %g\n", e, hbuf[e], expected); exit(1); } error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer 0"); hbuf = NULL; clReleaseEvent(krn_evt); clReleaseEvent(mem_evt); krn_evt = mem_evt = NULL; } for (i = 1; i <= 2; ++i) { clReleaseMemObject(devbuf[2 - i]); printf("dev buffer %u freed\n", nbuf - i); } for (i = 1; i <= nbuf; ++i) { clReleaseMemObject(hostbuf[nbuf - i]); printf("host buffer %u freed\n", nbuf - i); } return 0; }
/* ------- Create and destroy necessary objects ------- */ static void create_clobj(int gws, struct fmt_main * self) { self->params.min_keys_per_crypt = self->params.max_keys_per_crypt = gws; pinned_saved_keys = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(sha512_password) * gws, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_saved_keys"); plaintext = (sha512_password *) clEnqueueMapBuffer(queue[ocl_gpu_id], pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(sha512_password) * gws, 0, NULL, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_plain"); pinned_partial_hashes = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(sha512_hash) * gws, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating page-locked memory pinned_partial_hashes"); calculated_hash = (sha512_hash *) clEnqueueMapBuffer(queue[ocl_gpu_id], pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(sha512_hash) * gws, 0, NULL, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error mapping page-locked memory out_hashes"); // create arguments (buffers) salt_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY, sizeof(sha512_salt), NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating salt_buffer out argument"); pass_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY, sizeof(sha512_password) * gws, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_keys"); hash_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_WRITE_ONLY, sizeof(sha512_hash) * gws, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating buffer argument buffer_out"); work_buffer = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE, sizeof(sha512_buffers) * gws, NULL, &ret_code); HANDLE_CLERROR(ret_code, "Error creating buffer argument work_area"); //Set kernel arguments HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 0, sizeof(cl_mem), (void *) &salt_buffer), "Error setting argument 0"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 1, sizeof(cl_mem), (void *) &pass_buffer), "Error setting argument 1"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 2, sizeof(cl_mem), (void *) &hash_buffer), "Error setting argument 2"); if (gpu(source_in_use) || use_local(source_in_use)) { //Set prepare kernel arguments HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 0, sizeof(cl_mem), (void *) &salt_buffer), "Error setting argument 0"); HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 1, sizeof(cl_mem), (void *) &pass_buffer), "Error setting argument 1"); HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 2, sizeof(cl_mem), (void *) &work_buffer), "Error setting argument 2"); //Fast working memory. HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 3, sizeof(sha512_password) * local_work_size, NULL), "Error setting argument 3"); if (use_local(source_in_use)) { HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 4, sizeof(sha512_buffers) * local_work_size, NULL), "Error setting argument 4"); HANDLE_CLERROR(clSetKernelArg(prepare_kernel, 5, sizeof(sha512_ctx) * local_work_size, NULL), "Error setting argument 5"); } //Set crypt kernel arguments HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 3, sizeof(cl_mem), (void *) &work_buffer), "Error setting argument crypt_kernel (3)"); if (use_local(source_in_use)) { //Fast working memory. HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 4, sizeof(sha512_buffers) * local_work_size, NULL), "Error setting argument 4"); HANDLE_CLERROR(clSetKernelArg(crypt_kernel, 5, sizeof(sha512_ctx) * local_work_size, NULL), "Error setting argument 5"); } //Set final kernel arguments HANDLE_CLERROR(clSetKernelArg(final_kernel, 0, sizeof(cl_mem), (void *) &salt_buffer), "Error setting argument 0"); HANDLE_CLERROR(clSetKernelArg(final_kernel, 1, sizeof(cl_mem), (void *) &pass_buffer), "Error setting argument 1"); HANDLE_CLERROR(clSetKernelArg(final_kernel, 2, sizeof(cl_mem), (void *) &hash_buffer), "Error setting argument 2"); HANDLE_CLERROR(clSetKernelArg(final_kernel, 3, sizeof(cl_mem), (void *) &work_buffer), "Error setting argument crypt_kernel (3)"); if (use_local(source_in_use)) { //Fast working memory. HANDLE_CLERROR(clSetKernelArg(final_kernel, 4, sizeof(sha512_buffers) * local_work_size, NULL), "Error setting argument 4"); HANDLE_CLERROR(clSetKernelArg(final_kernel, 5, sizeof(sha512_ctx) * local_work_size, NULL), "Error setting argument 5"); } } memset(plaintext, '\0', sizeof(sha512_password) * gws); global_work_size = gws; }
void Render(float delta) { clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); // 7. Look at the results via synchronous buffer map. cl_float4 *ptr = (cl_float4 *) clEnqueueMapBuffer( queue, buffer, CL_TRUE, CL_MAP_READ, 0, kWidth * kHeight * sizeof(cl_float4), 0, NULL, NULL, NULL ); cl_float *viewTransformPtr = (cl_float *) clEnqueueMapBuffer( queue, viewTransform, CL_TRUE, CL_MAP_WRITE, 0, 16 * sizeof(cl_float), 0, NULL, NULL, NULL ); cl_float *worldTransformsPtr = (cl_float *) clEnqueueMapBuffer( queue, worldTransforms, CL_TRUE, CL_MAP_WRITE, 0, 16 * sizeof(cl_float)*2, 0, NULL, NULL, NULL ); memcpy(viewTransformPtr, viewMatrix, sizeof(float)*16); memcpy(worldTransformsPtr, sphereTransforms[0], sizeof(float)*16); memcpy(worldTransformsPtr+16, sphereTransforms[1], sizeof(float)*16); clEnqueueUnmapMemObject(queue, viewTransform, viewTransformPtr, 0, 0, 0); clEnqueueUnmapMemObject(queue, worldTransforms, worldTransformsPtr, 0, 0, 0); unsigned char* pixels = new unsigned char[kWidth*kHeight*4]; for(int i=0; i < kWidth * kHeight; i++){ pixels[i*4] = ptr[i].s[0]*255; pixels[i*4+1] = ptr[i].s[1]*255; pixels[i*4+2] = ptr[i].s[2]*255; pixels[i*4+3] = 1; } glBindTexture(GL_TEXTURE_2D, 1); glTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR ); glTexParameterf( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR ); glTexImage2D(GL_TEXTURE_2D, 0, 4, kWidth, kHeight, 0, GL_RGBA, GL_UNSIGNED_BYTE, pixels); delete [] pixels; glClearColor(1,1,1,1); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glMatrixMode(GL_PROJECTION); glLoadIdentity(); glOrtho(-1,1,1,-1,1,100); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glBegin(GL_QUADS); glTexCoord2f(0,1); glVertex3f(-1,-1,-1); glTexCoord2f(0,0); glVertex3f(-1,1,-1); glTexCoord2f(1,0); glVertex3f(1,1,-1); glTexCoord2f(1,1); glVertex3f(1,-1,-1); glEnd(); clFinish( queue ); SDL_GL_SwapWindow(window); }
int main(int argc, char **argv){ printf("Check OpenCL environtment\n"); cl_platform_id platid; cl_device_id devid; cl_int res; size_t param; /* Query OpenCL, get some information about the returned device */ clGetPlatformIDs(1u, &platid, NULL); clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL); cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL); clGetDeviceInfo(devid, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name); clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), ¶m, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param); clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), ¶m, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param); clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param); /* Check if kernel source exists, we compile argv[1] passed kernel */ if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); } char *kernel_source; if(load_program_source(argv[1], &kernel_source)) return 1; printf("Building from OpenCL source: \t%s\n", argv[1]); printf("Compile/query OpenCL_program:\t%s\n", argv[2]); /* Create context and kernel program */ cl_context context = clCreateContext(0, 1, &devid, NULL, NULL, NULL); cl_program pro = clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL); res = clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL); if(res != CL_SUCCESS){ printf("clBuildProgram failed: %d\n", res); char buf[0x10000]; clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL); printf("\n%s\n", buf); return(-1); } cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); check_return(res); /* Get the maximum work-group size for executing the kernel on the device */ size_t global, local; res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL); check_return(res); printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local); res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); check_return(res); printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param); cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL); if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); } local = 4; int n = 2 * local; //num_group * local workgroup size global = n; int num_groups= global / local, allocated_local= sizeof(data) * local + sizeof(debug) * local; data *DP __attribute__ ((aligned(16))); DP = calloc(n, sizeof(data) *1); debug *dbg __attribute__ ((aligned(16))); dbg = calloc(n, sizeof(debug)); printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups); printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256)); printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local); cl_mem cl_DP, cl_EC, cl_INV, DEBUG; cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res); check_return(res); cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(Elliptic_Curve), NULL, &res); check_return(res); //_constant address space cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(u8) * 0x80, NULL, &res); check_return(res); DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res); check_return(res); Elliptic_Curve EC; /* Curve domain parameters, (test vectors) ------------------------------------------------------------------------------------- p: c1c627e1638fdc8e24299bb041e4e23af4bb5427 is prime a: c1c627e1638fdc8e24299bb041e4e23af4bb5424 divisor g = 62980 b: 877a6d84155a1de374b72d9f9d93b36bb563b2ab divisor g = 227169643 Gx: 010aff82b3ac72569ae645af3b527be133442131 divisor g = 32209245 Gy: 46b8ec1e6d71e5ecb549614887d57a287df573cc divisor g = 972 precomputed_per_curve_constants: U: c1c627e1638fdc8e24299bb041e4e23af4bb5425 V: 3e39d81e9c702371dbd6644fbe1b1dc50b44abd9 already prepared mod p to test: a: 07189f858e3f723890a66ec1079388ebd2ed509c b: 6043379beb0dade6eed1e9d6de64f4a0c50639d4 gx: 5ef84aacf4f0ea6752f572d0741f40049f354dca gy: 418c695435af6b3d4d7cbb72967395016ef67239 resulting point: P.x: 01718f862ebe9423bd661a65355aa1c86ba330f8 program MUST got this point !! P.y: 557e8ed53ffbfe2c990a121967b340f62e0e4fe2 taken mod p: P.x: 41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3 P.y: 73ca143c9badedf2d9d3c7573307115ccfe04f13 */ u8 *t; t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427"); memcpy(EC.p, t, 20); t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c"); memcpy(EC.a, t, 20); t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4"); memcpy(EC.b, t, 20); t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca"); memcpy(EC.Gx, t, 20); t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239"); memcpy(EC.Gy, t, 20); t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425"); memcpy(EC.U, t, 20); t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9"); memcpy(EC.V, t, 20); /* we need to map buffer now to load some k into data */ DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710"); for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21); free(t); //d for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1); /* we can alter just a byte into a chosen k to verify that we'll get a different point! */ //DP[2].k[2] = 0x09; //no res = clEnqueueWriteBuffer(cmd_queue, cl_DP, CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_EC, CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL); check_return(res); res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP); /* i/o buffer */ res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC); res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV); res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG); //this used to debug kernel output check_return(res); // printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local); cl_event NDRangeEvent; cl_ulong start, end; /* Execute NDrange */ res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent); check_return(res); // res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent); check_return(res); printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data)); DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res); check_return(res); /* using clEnqueueReadBuffer template */ // res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL); check_return(res); clFlush(cmd_queue); clFinish(cmd_queue); /* get NDRange execution time with internal ocl profiler */ res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); check_return(res); printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000)); //relative to NDRange call printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start))); printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n"); for(int i = 0; i < n; i++) { // if(i %local == 0) { printf("%d \t", i); //printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7)); /* silence this doubled debug info printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3], dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]); */ //printf("%d %d\n", P[i].dig, P[i].c); bn_print("", DP[i].k, 21, 1); bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1); printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3], DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]); // } } /* Release OpenCL stuff, free the rest */ clReleaseMemObject(cl_DP); clReleaseMemObject(cl_EC); clReleaseMemObject(cl_INV); clReleaseMemObject(DEBUG); clReleaseKernel(kernelobj); clReleaseProgram(pro); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(kernel_source); puts("Done!"); return 0; }
int main(int argc, const char** argv) { cl_uint platform_count; cl_platform_id platforms[5]; cl_int err = CL_SUCCESS; unsigned int i, p; cl_device_type dev_type = CL_DEVICE_TYPE_ALL; void * ptrs[BLOCKS]; cl_command_queue cqs[BLOCKS]; cl_mem d_A[BLOCKS]; cl_mem d_C[BLOCKS]; cl_mem d_B[BLOCKS]; cl_event GPUDone[BLOCKS]; cl_event GPUExecution[BLOCKS]; struct timeval start, end; int workOffset[BLOCKS]; int workSize[BLOCKS]; unsigned int sizePerGPU = HC / BLOCKS; unsigned int sizeMod = HC % BLOCKS; size_t A_size = WA * HA; size_t A_mem_size = sizeof(TYPE) * A_size; TYPE* A_data; size_t B_size = WB * HB; size_t B_mem_size = sizeof(TYPE) * B_size; TYPE* B_data; size_t C_size = WC * HC; size_t C_mem_size = sizeof(TYPE) * C_size; TYPE* C_data; parse_args(argc, argv); check(clGetPlatformIDs(5, platforms, &platform_count)); if (platform_count == 0) { printf("No platform found\n"); exit(77); } cl_uint device_count; cl_uint devs[platform_count]; cl_device_id * devices[platform_count]; cl_context ctx[platform_count]; cl_command_queue * commandQueue[platform_count]; device_count = 0; for (p=0; p<platform_count; p++) { cl_platform_id platform = platforms[p]; err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]); if (err == CL_DEVICE_NOT_FOUND) { devs[p] = 0; continue; } if (devs[p] == 0) { printf("No OpenCL device found\n"); exit(77); } if (err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err); exit(EXIT_FAILURE); } if (devs[p] == 0) continue; devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]); commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]); check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL)); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err)); for(i = 0; i < devs[p]; ++i) { cl_device_id device = devices[p][i]; char name[2048]; name[0] = '\0'; clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL); printf("Device %d: %s\n", i, name); check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); } device_count += devs[p]; } if (device_count == 0) error("No device found\n"); cl_kernel multiplicationKernel[platform_count]; printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", (unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC); // allocate host memory for matrices A, B and C A_data = (TYPE*)malloc(A_mem_size); if (A_data == NULL) { perror("malloc"); } B_data = (TYPE*)malloc(B_mem_size); if (B_data == NULL) { perror("malloc"); } C_data = (TYPE*) malloc(C_mem_size); if (C_data == NULL) { perror("malloc"); } cl_program program[platform_count]; for (p=0; p<platform_count; p++) { if (devs[p] == 0) continue; check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err)); check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL)); check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err)); } printf("Initializing data...\n"); srand(2008); fillArray(A_data, A_size); fillArray(B_data, B_size); memset(C_data, 0, C_size); printf("Computing...\n"); workOffset[0] = 0; gettimeofday(&start, NULL); size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE}; int c = 0; for (p=0; p<platform_count;p++) { for (i=0; i<devs[p]; i++) { check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err)); c++; } } for(i=0; i < BLOCKS; ++i) { int d = i % device_count; cl_uint platform = 0; // determine device platform int dev = d; for (platform = 0; platform < platform_count; platform++) { if ((cl_int)(dev - devs[platform]) < 0) break; dev -= devs[platform]; } workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU; check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err)); check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err)); check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i])); check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d])); check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i])); size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])}; check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i])); // Non-blocking copy of result from device to host cqs[i] = commandQueue[platform][dev]; check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err)); if(i+1 < BLOCKS) workOffset[i + 1] = workOffset[i] + workSize[i]; } // CPU sync with GPU for (p=0; p<platform_count;p++) { cl_uint dev; for (dev=0; dev<devs[p]; dev++) { clFinish(commandQueue[p][dev]); } } gettimeofday(&end, NULL); double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec)); double dSeconds = timing/1000/1000; double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB; double gflops = 1.0e-9 * dNumOps/dSeconds; printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n", gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]); // compute reference solution if (check) { printf("Comparing results with CPU computation... "); TYPE* reference = (TYPE*)malloc(C_mem_size); computeReference(reference, A_data, B_data, HA, WA, WB); // check result int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f); if (res == 0) { printf("\n\n"); printDiff(reference, C_data, WC, HC, 100, 1.0e-5f); } else printf("PASSED\n\n"); free(reference); } for(i = 0; i < BLOCKS; i++) { clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL); } for(i = 0; i < BLOCKS; i++) { clFinish(cqs[i]); } for (i=0; i<device_count; i++) { clReleaseMemObject(d_B[i]); } for(i = 0; i < BLOCKS; i++) { clReleaseMemObject(d_A[i]); clReleaseMemObject(d_C[i]); clReleaseEvent(GPUExecution[i]); clReleaseEvent(GPUDone[i]); } for (p=0; p<platform_count;p++) { if (devs[p] == 0) continue; check(clReleaseKernel(multiplicationKernel[p])); check(clReleaseProgram(program[p])); check(clReleaseContext(ctx[p])); cl_uint k; for(k = 0; k < devs[p]; ++k) { check(clReleaseCommandQueue(commandQueue[p][k])); } } free(A_data); free(B_data); free(C_data); return 0; }
/////////////////////////////////////////////////////////////////////////////// // test the bandwidth of a device to host memcopy of a specific size /////////////////////////////////////////////////////////////////////////////// double testHostToDeviceTransfer(unsigned int memSize, accessMode accMode, memoryMode memMode) { double elapsedTimeInSec = 0.0; double bandwidthInMBs = 0.0; unsigned char* h_data = NULL; cl_mem cmPinnedData = NULL; cl_mem cmDevData = NULL; cl_int ciErrNum = CL_SUCCESS; // Allocate and init host memory, pinned or conventional if(memMode == PINNED) { // Create a host buffer cmPinnedData = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Get a mapped pointer h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //initialize for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++) { h_data[i] = (unsigned char)(i & 0xff); } // unmap and make data in the host buffer valid ciErrNum = clEnqueueUnmapMemObject(cqCommandQueue, cmPinnedData, (void*)h_data, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); h_data = NULL; // buffer is unmapped } else { // standard host alloc h_data = (unsigned char *)malloc(memSize); //initialize for(unsigned int i = 0; i < memSize/sizeof(unsigned char); i++) { h_data[i] = (unsigned char)(i & 0xff); } } // allocate device memory cmDevData = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); // Sync queue to host, start timer 0, and copy data from Host to GPU clFinish(cqCommandQueue); shrDeltaT(0); if(accMode == DIRECT) { if(memMode == PINNED) { // Get a mapped pointer h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); } // DIRECT: API access to device buffer for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevData, CL_FALSE, 0, memSize, h_data, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } ciErrNum = clFinish(cqCommandQueue); oclCheckError(ciErrNum, CL_SUCCESS); } else { // MAPPED: mapped pointers to device buffer and conventional pointer access void* dm_idata = clEnqueueMapBuffer(cqCommandQueue, cmDevData, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); if(memMode == PINNED ) { h_data = (unsigned char*)clEnqueueMapBuffer(cqCommandQueue, cmPinnedData, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); } for(unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) { memcpy(dm_idata, h_data, memSize); } ciErrNum = clEnqueueUnmapMemObject(cqCommandQueue, cmDevData, dm_idata, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); } //get the the elapsed time in seconds elapsedTimeInSec = shrDeltaT(0); //calculate bandwidth in MB/s bandwidthInMBs = ((double)memSize * (double)MEMCOPY_ITERATIONS)/(elapsedTimeInSec * (double)(1 << 20)); //clean up memory if(cmDevData)clReleaseMemObject(cmDevData); if(cmPinnedData) { clEnqueueUnmapMemObject(cqCommandQueue, cmPinnedData, (void*)h_data, 0, NULL, NULL); clReleaseMemObject(cmPinnedData); } h_data = NULL; return bandwidthInMBs; }
magma_err_t magma_cgeqrf2_gpu( magma_int_t m, magma_int_t n, magmaFloatComplex_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex *tau, magma_err_t *info, magma_queue_t* queue) { /* -- clMAGMA (version 1.1.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date January 2014 Purpose ======= CGEQRF computes a QR factorization of a complex M-by-N matrix A: A = Q * R. Arguments ========= M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. dA (input/output) COMPLEX array on the GPU, dimension (LDDA,N) On entry, the M-by-N matrix dA. On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a product of min(m,n) elementary reflectors (see Further Details). LDDA (input) INTEGER The leading dimension of the array dA. LDDA >= max(1,M). To benefit from coalescent memory accesses LDDA must be dividable by 16. TAU (output) COMPLEX array, dimension (min(M,N)) The scalar factors of the elementary reflectors (see Further Details). INFO (output) INTEGER = 0: successful exit < 0: if INFO = -i, the i-th argument had an illegal value if INFO = -9, internal GPU memory allocation failed. Further Details =============== The matrix Q is represented as a product of elementary reflectors Q = H(1) H(2) . . . H(k), where k = min(m,n). Each H(i) has the form H(i) = I - tau * v * v' where tau is a complex scalar, and v is a complex vector with v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), and tau in TAU(i). ===================================================================== */ #define dA(a_1,a_2) dA, (dA_offset + (a_1) + (a_2)*(ldda)) #define work_ref(a_1) work, (a_1) #define work_href(a_1) ( work + (a_1)) #define hwork ( work + (nb)*(m)) #define hhwork work, ((nb)*(m)) magmaFloatComplex_ptr dwork; magmaFloatComplex *work; magma_int_t i, k, ldwork, lddwork, old_i, old_ib, rows; magma_int_t nbmin, nx, ib, nb; magma_int_t lhwork, lwork; *info = 0; if (m < 0) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,m)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } k = min(m,n); if (k == 0) return MAGMA_SUCCESS; nb = magma_get_cgeqrf_nb(m); lwork = (m+n) * nb; lhwork = lwork - (m)*nb; if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, n*nb )) { *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } /* if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, lwork ) ) { *info = MAGMA_ERR_HOST_ALLOC; magma_free( dwork ); return *info; } */ cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaFloatComplex)*lwork, NULL, NULL); work = (magmaFloatComplex*)clEnqueueMapBuffer(queue[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL); nbmin = 2; nx = nb; ldwork = m; lddwork= n; if (nb >= nbmin && nb < k && nx < k) { /* Use blocked code initially */ old_i = 0; old_ib = nb; for (i = 0; i < k-nx; i += nb) { ib = min(k-i, nb); rows = m -i; magma_queue_sync( queue[1] ); chk(magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), ldwork, queue[0], NULL)); if (i>0){ /* Apply H' to A(i:m,i+2*ib:n) from the left */ magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-old_i, n-old_i-2*old_ib, old_ib, dA(old_i, old_i ), ldda, dwork,0, lddwork, dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queue[1]); chk(magma_csetmatrix_async( old_ib, old_ib, work_ref(old_i), ldwork, dA(old_i, old_i), ldda, queue[1], NULL)); } magma_queue_sync(queue[0]); lapackf77_cgeqrf(&rows, &ib, work_href(i), &ldwork, tau+i, hwork, &lhwork, info); /* Form the triangular factor of the block reflector H = H(i) H(i+1) . . . H(i+ib-1) */ lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr, &rows, &ib, work_href(i), &ldwork, tau+i, hwork, &ib); cpanel_to_q( MagmaUpper, ib, work_href(i), ldwork, hwork+ib*ib ); /* download the i-th V matrix */ chk(magma_csetmatrix_async(rows, ib, work_ref(i), ldwork, dA(i,i), ldda, queue[0], NULL)); /* download the T matrix */ magma_queue_sync( queue[1] ); chk(magma_csetmatrix_async( ib, ib, hhwork, ib, dwork, 0, lddwork, queue[0], NULL)); magma_queue_sync( queue[0] ); if (i + ib < n) { if (i+nb < k-nx) { /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */ magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]); cq_to_panel( MagmaUpper, ib, work_href(i), ldwork, hwork+ib*ib ); } else { magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, rows, n-i-ib, ib, dA(i, i ), ldda, dwork,0, lddwork, dA(i, i+ib), ldda, dwork,ib, lddwork, queue[1]); cq_to_panel( MagmaUpper, ib, work_href(i), ldwork, hwork+ib*ib ); chk(magma_csetmatrix_async(ib, ib, work_ref(i), ldwork, dA(i,i), ldda, queue[1], NULL)); } old_i = i; old_ib = ib; } } } else { i = 0; } magma_free(dwork); /* Use unblocked code to factor the last or only block. */ if (i < k) { ib = n-i; rows = m-i; magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work, 0, rows, queue[1], NULL); magma_queue_sync(queue[1]); lhwork = lwork - rows*ib; lapackf77_cgeqrf(&rows, &ib, work, &rows, tau+i, work+ib*rows, &lhwork, info); magma_csetmatrix_async(rows, ib, work, 0, rows, dA(i, i), ldda, queue[1], NULL); } magma_queue_sync(queue[0]); magma_queue_sync(queue[1]); // magma_free_cpu(work); clEnqueueUnmapMemObject(queue[0], buffer, work, 0, NULL, NULL); clReleaseMemObject(buffer); return *info; } /* magma_cgeqrf2_gpu */
int main(void) { cl_context context = 0; cl_command_queue command_waiting_line = 0; cl_program program = 0; cl_device_id device_id = 0; cl_kernel kernel = 0; // int numberOfMemoryObjects = 3; cl_mem memoryObjects[3] = {0, 0, 0}; cl_platform_id platform_id = NULL; cl_uint ret_num_devices; cl_int errorNumber; cl_int ret; /* Load the source code containing the kernel*/ char fileName[] = "source/parallel/composition_population.cl"; FILE *fp; char *source_str; size_t source_size; fp = fopen(fileName, "r"); cl_uint ret_num_platforms; if (!fp) { fprintf(stderr, "Failed to load kernel %s:%d.\n", __FILE__, __LINE__); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // printf("file: %s :file", source_str); getInfo(); ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to get platform id's. %s:%d\n", __FILE__, __LINE__); return 1; } ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to get OpenCL devices. %s:%d\n", __FILE__, __LINE__); return 1; } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create an OpenCL context. %s:%d\n", __FILE__, __LINE__); return 1; } #ifdef CL_VERSION_2_0 command_waiting_line = clCreateCommandQueueWithProperties(context, device_id, 0, &ret); #else command_waiting_line = clCreateCommandQueue(context, device_id, 0, &ret); #endif if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create the OpenCL command queue. %s:%d\n", __FILE__, __LINE__); return 1; } /* create program */ program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create OpenCL program. %s:%d\n", __FILE__, __LINE__); return 1; } /* Build Kernel Program */ ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (!success_verification(ret)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to build OpenCL program. %s:%d\n", __FILE__, __LINE__); return 1; } kernel = clCreateKernel(program, "composition_population", &errorNumber); if (!success_verification(errorNumber)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create OpenCL kernel. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Setup memory] */ /* Number of elements in the arrays of input and output data. */ /* The buffers are the size of the arrays. */ uint16_t activity_atom_size = MAX_INDEPENDENTCLAUSE_TABLET * 1; uint8_t program_size = 1; uint8_t population_size = 4; size_t activity_atom_byte_size = activity_atom_size * sizeof(v16us); uint16_t population_byte_size = (uint16_t)(program_size * (uint16_t)(population_size * sizeof(v16us))); /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than allocating * it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within the * kernel. */ int createMemoryObjectsSuccess = TRUE; memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, activity_atom_byte_size, NULL, &errorNumber); createMemoryObjectsSuccess &= success_verification(errorNumber); memoryObjects[1] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, population_byte_size, NULL, &errorNumber); createMemoryObjectsSuccess &= success_verification(errorNumber); memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, newspaper_byte_size, NULL, &errorNumber); createMemoryObjectsSuccess &= success_verification(errorNumber); if (!createMemoryObjectsSuccess) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to create OpenCL buffer. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Setup memory] */ /* [Map the buffers to pointers] */ /* Map the memory buffers created by the OpenCL implementation to pointers so * we can access them on the CPU. */ int mapMemoryObjectsSuccess = TRUE; v16us *activity_atom = (v16us *)clEnqueueMapBuffer( command_waiting_line, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, activity_atom_byte_size, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= success_verification(errorNumber); // cl_int *inputB = (cl_int *)clEnqueueMapBuffer( // command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, // bufferSize, 0, // NULL, NULL, &errorNumber); // mapMemoryObjectsSuccess &= success_verification(errorNumber); if (!mapMemoryObjectsSuccess) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Map the buffers to pointers] */ /* [Initialize the input data] */ const char *activity_atom_text = "nyistu htoftu hnattu hnamtu"; const uint16_t activity_atom_text_size = (uint16_t)(strlen(activity_atom_text)); const char *quiz_independentClause_list_text = "zrundoka hwindocayu hwindokali" "hwindoka tyutdocayu tyindokali" "tyutdoka tyutdocayu hfutdokali" "tyindoka fwandocayu nyatdokali"; //"bu.hnac.2.hnac.buka bu.hnac.2.hnac.buca yu " //"bu.hnac.4.hnac.bukali"; const uint16_t quiz_independentClause_list_text_size = (uint16_t)strlen(quiz_independentClause_list_text); uint16_t quiz_independentClause_list_size = 4; v16us quiz_independentClause_list[8]; uint16_t text_remainder = 0; // uint16_t program_worth = 0; uint64_t random_seed = 0x0123456789ABCDEF; uint16_t tablet_indexFinger = 0; // uint8_t champion = 0; // uint16_t champion_worth = 0; // v16us program_; // v16us population[4]; memset(quiz_independentClause_list, 0, (size_t)(quiz_independentClause_list_size * TABLET_LONG * WORD_THICK)); text_code(activity_atom_text_size, activity_atom_text, &activity_atom_size, activity_atom, &text_remainder); assert(text_remainder == 0); text_code(quiz_independentClause_list_text_size, quiz_independentClause_list_text, &quiz_independentClause_list_size, quiz_independentClause_list, &text_remainder); /* [Initialize the input data] */ /* [Un-map the buffers] */ /* * Unmap the memory objects as we have finished using them from the CPU side. * We unmap the memory because otherwise: * - reads and writes to that memory from inside a kernel on the OpenCL side * are undefined. * - the OpenCL implementation cannot free the memory when it is finished. */ if (!success_verification( clEnqueueUnmapMemObject(command_waiting_line, memoryObjects[0], activity_atom, 0, NULL, NULL))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__, __LINE__); return 1; } // if (!success_verification(clEnqueueUnmapMemObject(command_waiting_line, // memoryObjects[1], // inputB, 0, NULL, NULL))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); // cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ // << endl; // return 1; //} /* [Un-map the buffers] */ /* [Set the kernel arguments] */ int setKernelArgumentsSuccess = TRUE; printf("arg0\n"); setKernelArgumentsSuccess &= success_verification(clSetKernelArg( kernel, 0, sizeof(uint8_t), (uint8_t *)&activity_atom_size)); printf("arg1\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[0])); printf("arg2\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 2, sizeof(uint16_t), (uint16_t *)&program_size)); printf("arg3\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 3, sizeof(uint8_t), (uint8_t *)&population_size)); printf("arg4\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 4, sizeof(uint64_t), (uint64_t *)&random_seed)); printf("arg5\n"); setKernelArgumentsSuccess &= success_verification(clSetKernelArg(kernel, 5, sizeof(uint64_t *), NULL)); printf("arg6\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 6, sizeof(cl_mem), &memoryObjects[1])); printf("arg7\n"); setKernelArgumentsSuccess &= success_verification(clSetKernelArg(kernel, 7, sizeof(uint8_t *), NULL)); printf("arg8\n"); setKernelArgumentsSuccess &= success_verification( clSetKernelArg(kernel, 8, sizeof(cl_mem), &memoryObjects[2])); if (!setKernelArgumentsSuccess) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed setting OpenCL kernel arguments. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Set the kernel arguments] */ /* An event to associate with the Kernel. Allows us to retrieve profiling * information later. */ cl_event event = 0; /* [Global work size] */ /* * Each instance of our OpenCL kernel operates on a single element of each * array so the number of * instances needed is the number of elements in the array. */ size_t globalWorksize[1] = {population_size}; size_t localWorksize[1] = {2}; /* Enqueue the kernel */ if (!success_verification(clEnqueueNDRangeKernel( command_waiting_line, kernel, 1, NULL, globalWorksize, localWorksize, 0, NULL, &event))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed enqueuing the kernel. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Global work size] */ /* Wait for kernel execution completion. */ if (!success_verification(clFinish(command_waiting_line))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed waiting for kernel execution to finish. %s:%d\n", __FILE__, __LINE__); return 1; } /* Print the profiling information for the event. */ // printProfilingInfo(event); /* Release the event object. */ if (!success_verification(clReleaseEvent(event))) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed releasing the event object. %s:%d\n", __FILE__, __LINE__); return 1; } /* Get a pointer to the output data. */ printf("clOut\n"); v16us *output = (v16us *)clEnqueueMapBuffer( command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_READ, 0, population_byte_size, 0, NULL, NULL, &errorNumber); v16us *newspaper = (v16us *)clEnqueueMapBuffer( command_waiting_line, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, newspaper_byte_size, 0, NULL, NULL, &errorNumber); if (!success_verification(errorNumber)) { // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__); return 1; } /* [Output the results] */ /* Uncomment the following block to print results. */ for (tablet_indexFinger = 0; tablet_indexFinger < (population_size * TABLET_LONG); ++tablet_indexFinger) { if (tablet_indexFinger % 0x10 == 0) printf("\n"); printf("%04X ", (uint)((uint16_t *)output)[tablet_indexFinger]); } printf("\n"); // printf("program %04X \n", (uint)*((uint16_t *)&(output[1]))); printf("newspaper \n"); for (tablet_indexFinger = 0; tablet_indexFinger < (NEWSPAPER_LONG * TABLET_LONG); ++tablet_indexFinger) { if (tablet_indexFinger % 0x10 == 0) printf("\n"); printf("%04X ", (uint)((uint16_t *)newspaper)[tablet_indexFinger]); } printf("\n"); /* [Output the results] */ /* Unmap the memory object as we are finished using them from the CPU side. */ if (!success_verification(clEnqueueUnmapMemObject( command_waiting_line, memoryObjects[1], output, 0, NULL, NULL))) { printf("unmapping\n"); // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__, __LINE__); return 1; } if (!success_verification(clEnqueueUnmapMemObject( command_waiting_line, memoryObjects[2], newspaper, 0, NULL, NULL))) { printf("unmapping\n"); // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__, __LINE__); return 1; } printf("releasing\n"); /* Release OpenCL objects. */ // cleanUpOpenCL(context, command_waiting_line, program, kernel, // memoryObjects, // numberOfMemoryObjects); }
int main(int argc, char** argv) { printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE); cl_int error; cl_uint num_platforms; // Get the number of platforms error = clGetPlatformIDs(0, NULL, &num_platforms); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get the list of platforms cl_platform_id* platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms); error = clGetPlatformIDs(num_platforms, platforms, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Print the chosen platform (if there are multiple platforms, choose the first one) cl_platform_id platform = platforms[0]; char pbuf[100]; error = clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Platform: %s\n", pbuf); // Create a GPU context cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) platform, 0}; context = clCreateContextFromType(context_properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Get and print the chosen device (if there are multiple devices, choose the first one) size_t devices_size; error = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &devices_size); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); cl_device_id *devices = (cl_device_id *) malloc(devices_size); error = clGetContextInfo(context, CL_CONTEXT_DEVICES, devices_size, devices, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); device = devices[0]; error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf), pbuf, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); printf("Device: %s\n", pbuf); // Create a command queue command_queue = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); int size; int grid_rows,grid_cols = 0; float *FilesavingTemp,*FilesavingPower; //,*MatrixOut; char *tfile, *pfile, *ofile; int total_iterations = 60; int pyramid_height = 1; // number of iterations if (argc < 7) usage(argc, argv); if((grid_rows = atoi(argv[1]))<=0|| (grid_cols = atoi(argv[1]))<=0|| (pyramid_height = atoi(argv[2]))<=0|| (total_iterations = atoi(argv[3]))<=0) usage(argc, argv); tfile=argv[4]; pfile=argv[5]; ofile=argv[6]; size=grid_rows*grid_cols; // --------------- pyramid parameters --------------- int borderCols = (pyramid_height)*EXPAND_RATE/2; int borderRows = (pyramid_height)*EXPAND_RATE/2; int smallBlockCol = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int smallBlockRow = BLOCK_SIZE-(pyramid_height)*EXPAND_RATE; int blockCols = grid_cols/smallBlockCol+((grid_cols%smallBlockCol==0)?0:1); int blockRows = grid_rows/smallBlockRow+((grid_rows%smallBlockRow==0)?0:1); FilesavingTemp = (float *) malloc(size*sizeof(float)); FilesavingPower = (float *) malloc(size*sizeof(float)); // MatrixOut = (float *) calloc (size, sizeof(float)); if( !FilesavingPower || !FilesavingTemp) // || !MatrixOut) fatal("unable to allocate memory"); // Read input data from disk readinput(FilesavingTemp, grid_rows, grid_cols, tfile); readinput(FilesavingPower, grid_rows, grid_cols, pfile); // Load kernel source from file const char *source = load_kernel_source("hotspot_kernel.cl"); size_t sourceSize = strlen(source); // Compile the kernel cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions," "); #ifdef BLOCK_SIZE sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE); #endif // Create an executable from the kernel error = clBuildProgram(program, 1, &device, clOptions, NULL, NULL); // Show compiler warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if (strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); kernel = clCreateKernel(program, "hotspot", &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long start_time = get_time(); // Create two temperature matrices and copy the temperature input data cl_mem MatrixTemp[2]; // Create input memory buffers on device MatrixTemp[0] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingTemp, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Lingjie Zhang modifited at Nov 1, 2015 //MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(float) * size, NULL, &error); MatrixTemp[1] = clCreateBuffer(context, CL_MEM_READ_WRITE , sizeof(float) * size, NULL, &error); // end Lingjie Zhang modification if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Copy the power input data cl_mem MatrixPower = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * size, FilesavingPower, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); // Perform the computation int ret = compute_tran_temp(MatrixPower, MatrixTemp, grid_cols, grid_rows, total_iterations, pyramid_height, blockCols, blockRows, borderCols, borderRows, FilesavingTemp, FilesavingPower); // Copy final temperature data back cl_float *MatrixOut = (cl_float *) clEnqueueMapBuffer(command_queue, MatrixTemp[ret], CL_TRUE, CL_MAP_READ, 0, sizeof(float) * size, 0, NULL, NULL, &error); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); long long end_time = get_time(); printf("Total time: %.3f seconds\n", ((float) (end_time - start_time)) / (1000*1000)); // Write final output to output file writeoutput(MatrixOut, grid_rows, grid_cols, ofile); error = clEnqueueUnmapMemObject(command_queue, MatrixTemp[ret], (void *) MatrixOut, 0, NULL, NULL); if (error != CL_SUCCESS) fatal_CL(error, __LINE__); clReleaseMemObject(MatrixTemp[0]); clReleaseMemObject(MatrixTemp[1]); clReleaseMemObject(MatrixPower); clReleaseContext(context); return 0; }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, j, err; /* Data and buffers */ float data_one[100], data_two[100], result_array[100]; cl_mem buffer_one, buffer_two; void* mapped_memory; /* Initialize arrays */ for(i=0; i<100; i++) { data_one[i] = 1.0f*i; data_two[i] = -1.0f*i; result_array[i] = 0.0f; } /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create the kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create buffers */ buffer_one = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(data_one), data_one, &err); if(err < 0) { perror("Couldn't create a buffer object"); exit(1); } buffer_two = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(data_two), data_two, NULL); /* Set buffers as arguments to the kernel */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_one); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_two); if(err < 0) { perror("Couldn't set the buffer as the kernel argument"); exit(1); } /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Enqueue command to copy buffer one to buffer two */ err = clEnqueueCopyBuffer(queue, buffer_one, buffer_two, 0, 0, sizeof(data_one), 0, NULL, NULL); if(err < 0) { perror("Couldn't perform the buffer copy"); exit(1); } /* Enqueue command to map buffer two to host memory */ mapped_memory = clEnqueueMapBuffer(queue, buffer_two, CL_TRUE, CL_MAP_READ, 0, sizeof(data_two), 0, NULL, NULL, &err); if(err < 0) { perror("Couldn't map the buffer to host memory"); exit(1); } /* Transfer memory and unmap the buffer */ memcpy(result_array, mapped_memory, sizeof(data_two)); err = clEnqueueUnmapMemObject(queue, buffer_two, mapped_memory, 0, NULL, NULL); if(err < 0) { perror("Couldn't unmap the buffer"); exit(1); } /* Display updated buffer */ for(i=0; i<10; i++) { for(j=0; j<10; j++) { printf("%6.1f", result_array[j+i*10]); } printf("\n"); } /* Deallocate resources */ clReleaseMemObject(buffer_one); clReleaseMemObject(buffer_two); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
float sgemmMain(int rowa,int cola,int colb) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; const unsigned int numberOfMemoryObjects = 3; cl_mem memoryObjectsa = 0; cl_mem memoryObjectsb = 0; cl_mem memoryObjectsc = 0; cl_int errorNumber; cl_uint clrowa = rowa; cl_uint clcola = cola; cl_uint clcolb = colb; int err; err = createContext(&context); LOGD("create context"); err = createCommandQueue(context, &commandQueue, &device); err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program); kernel = clCreateKernel(program, "sgemm", &errorNumber); LOGD("createKernel code %d",errorNumber); LOGD("start computing"); float alpha = 1; float beta = 0.1; /* Create the matrices. */ size_t matrixSizea = rowa * cola; size_t matrixSizeb = cola * colb; size_t matrixSizec = rowa * colb; /* As all the matrices have the same size, the buffer size is common. */ size_t bufferSizea = matrixSizea * sizeof(float); size_t bufferSizeb = matrixSizeb * sizeof(float); size_t bufferSizec = matrixSizec * sizeof(float); /* Create buffers for the matrices used in the kernel. */ int createMemoryObjectsSuccess = 0; memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; LOGD("create memory err %d",createMemoryObjectsSuccess); int mapMemoryObjectsSuccess = 0; cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; LOGD("map memory err %d",mapMemoryObjectsSuccess); sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC); LOGD("data initial finish"); int unmapMemoryObjectsSuccess = 0; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; LOGD("unmap memory err %d",unmapMemoryObjectsSuccess); int setKernelArgumentsSuccess = 0; errorNumber = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjectsa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta); setKernelArgumentsSuccess &= errorNumber; LOGD("setKernel err %d",setKernelArgumentsSuccess); LOGD("start running kernel"); clock_t start_t,end_t; float cost_time; start_t = clock(); cl_event event = 0; size_t globalWorksize[2] = {rowa, colb}; errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event); //LOGD("Enqueue err code %d",errorNumber); errorNumber = clFinish(commandQueue); end_t = clock(); cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000; LOGD("Finish err code %d",errorNumber); float time; time = printProfilingInfo(event); LOGT("using CPU clock: %f ms",cost_time); LOGT("using GPU clock: %f ms",time); clReleaseEvent(event); matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber); clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("read out matrixC finish"); LOGD("matrixC value C(0,0): %f",matrixC[0]); cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects); LOGD("RUNNING finsh"); return time; }
inline void vector_sum(const int arraySize, const double* inputA, const double* inputB, double* output) { /* Allocate memory buffers */ /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than * allocating it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within * the kernel. */ bool createMemoryObjectSuccess = true; int numberOfMemoryObjects = 3; cl_mem memoryObjects[3] = {0, 0, 0}; int errorNumber = 0; int bufferSize = arraySize*sizeof(double); memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputA, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 1."); memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputB, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 2."); memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, output, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 3."); /* Enqueue commands and kernels */ /* Enqueue to the command queues the commands that control the sequence * and synchronization of kernel execution, reading and writing of data, * and manipulation of memory objects */ /* Execute a kernel function */ /* Call clSetKernelArg() for each parameter in the kernel */ bool setKernelArgumentsSuccess = true; setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2])); if (not setKernelArgumentsSuccess) { cleanUpOpenCL(); std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Determine the work-group size and index space for the kernel */ const size_t globalWorkSize[1] = {arraySize}; const size_t localWorkSize[1] = { 1 }; /* Enqueue the kernel for execution in the command queue */ //for (int j = 0; j < ITER; j++) { if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" << __LINE__ <<std::endl; exit(1); } //} /* Get a pointer to the output data */ output = (double*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, arraySize, 0, NULL, NULL, &errorNumber); if (not checkSuccess(errorNumber)) { cleanUpOpenCL(); std::cerr << "Failed to map buffer " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } /* Wait for kernel execution */ if (not checkSuccess(clFinish(commandQueue))) { cleanUpOpenCL(); std::cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Unmap the memory objects as we finished using them in the CPU */ if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clReleaseMemObject(memoryObjects[1]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } }