/*
 * Timed round trip for the single-precision SYRK benchmark.
 *
 * Between timer.Start() and timer.Stop() this creates the device buffers for
 * A and C, calls initialize_gpu_buffer(), enqueues clblasSsyrk() and reads C
 * back into buffer_.c_ with a blocking read.
 *
 * NOTE(review): the cl_mem objects created here are not released in this
 * function - presumably that happens elsewhere in the class; confirm.
 * NOTE(review): the error codes stored in `err` are never inspected.
 */
void xSyrk<float>::roundtrip_func()
{
    timer.Start(timer_id);

    // Allocation sizes in bytes: leading dimension times vector count, plus
    // the element offset in front of the matrix.
    const size_t a_bytes =
        (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(float);
    const size_t c_bytes =
        (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(float);

    cl_int err;
    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, a_bytes, NULL, &err);
    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, c_bytes, NULL, &err);

    this->initialize_gpu_buffer();

    // No completion event is requested for the SYRK call itself; the blocking
    // read below is what the timing waits on.
    clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_,
                buffer_.n_, buffer_.k_, buffer_.alpha_,
                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                buffer_.beta_,
                buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
                1, &queue_, 0, NULL, NULL);

    // Read C back, skipping the offC_ elements in front of the matrix.
    const size_t read_offset = buffer_.offC_ * sizeof(float);
    const size_t read_bytes = buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(float);
    err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
                              read_offset, read_bytes, buffer_.c_,
                              0, NULL, &event_);

    clWaitForEvents(1, &event_);
    timer.Stop(timer_id);
}
/*
 * Timed call of clblasSsyrk() only: the timer brackets the enqueue of the
 * SYRK operation on queue_ and the wait for its completion event (event_).
 * The device buffers in buffer_ are used as they are; nothing is allocated
 * or read back here.
 */
void xSyrk<float>::call_func()
{
    // A single command queue and no events to wait on before launch.
    const cl_uint queue_count = 1;
    const cl_uint wait_count = 0;

    timer.Start(timer_id);

    clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_,
                buffer_.n_, buffer_.k_, buffer_.alpha_,
                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                buffer_.beta_,
                buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
                queue_count, &queue_, wait_count, NULL, &event_);

    clWaitForEvents(1, &event_);

    timer.Stop(timer_id);
}
/**
 * Computes the cross product t(A) * A of a column-major matrix with clBLAS
 * SYRK on the given OpenCL device.
 *
 * SYRK is called with clblasColumnMajor, clblasTrans, N = ncol, K = nrow,
 * lda = nrow, alpha = 1 and beta = 0, writing the upper triangle
 * (clblasUpper) of the ncol x ncol result. After the result is read back,
 * symmetrizeSquare_f / symmetrizeSquare_d is applied to it (presumably
 * mirroring the computed triangle into the other half - confirm against
 * that helper).
 *
 * @param device     OpenCL device used to create the context and queue.
 * @param inMatrix   Host matrix with nrow * ncol elements (float or double,
 *                   depending on use_float).
 * @param outMatrix  Host buffer with ncol * ncol elements. Its contents are
 *                   copied to the device on buffer creation
 *                   (CL_MEM_COPY_HOST_PTR) and overwritten with the result.
 * @param nrow       Number of rows of the input matrix.
 * @param ncol       Number of columns of the input matrix.
 * @param use_float  true for single precision, false for double precision.
 * @return ErrorStatus initialised with { OpenCL error code, clBLAS status }.
 *
 * Every stage only runs while err == CL_SUCCESS, so the first failure skips
 * the remaining work and falls through to the cleanup at the end.
 *
 * NOTE(review): clblasSetup() is called here but clblasTeardown() is not;
 * left as is in case the library is meant to stay initialised across calls.
 */
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix, int nrow, int ncol, bool use_float)
{
    std::stringstream result;

    // Do all size arithmetic in size_t: the int products nrow * ncol and
    // ncol * ncol overflow for large matrices before they are widened.
    const size_t n_rows = static_cast<size_t>(nrow);
    const size_t n_cols = static_cast<size_t>(ncol);
    const size_t elem_size = use_float ? sizeof(float) : sizeof(double);
    const size_t input_bytes = n_rows * n_cols * elem_size;
    const size_t output_bytes = n_cols * n_cols * elem_size;

    if (debug) {
        result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE")
               << ", nrow = " << nrow << ", ncol = " << ncol << ")" << std::endl << std::endl;
    }

    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        err = CL_INVALID_OPERATION;
    }

    // get first platform (only used for the debug output below)
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }

    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }

    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }

    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }

    // buffers (the host pointer is passed untyped; only the byte count
    // depends on the precision)
    cl_mem cl_input_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrix:" << std::endl;
        }
        cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         input_bytes, inMatrix, &err);
    }

    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                          output_bytes, outMatrix, &err);
    }

    // SYRK: C = alpha * t(A) * A + beta * C with alpha = 1, beta = 0
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = clblasTrans;
    const size_t lda = n_rows;
    const size_t ldc = n_cols;
    const cl_float alpha = 1.0;
    const clblasUplo uplo = clblasUpper;

    cl_event event = NULL;
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSsyrk:" << std::endl;
            }
            status = clblasSsyrk(order, uplo, transA, n_cols, n_rows, alpha,
                                 cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
        } else {
            if (debug) {
                result << "clblasDsyrk:" << std::endl;
            }
            status = clblasDsyrk(order, uplo, transA, n_cols, n_rows, alpha,
                                 cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
        }

        // Propagate a clBLAS failure for BOTH precisions so the later stages
        // are skipped instead of waiting on an event that was never created.
        if (status != CL_SUCCESS) {
            if (debug) {
                result << (use_float ? "clblasSsyrk error:" : "clblasDsyrk error:")
                       << clblasErrorToString(status) << std::endl;
            }
            err = status;
        }
    }

    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }

    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        err = clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, output_bytes,
                                  outMatrix, 0, NULL, NULL);
        // Only post-process data that was actually read back.
        if (err == CL_SUCCESS) {
            if (use_float) {
                symmetrizeSquare_f(static_cast<float *>(outMatrix), ncol);
            } else {
                symmetrizeSquare_d(static_cast<double *>(outMatrix), ncol);
            }
        }
    }

    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;

    // cleanup: release only the objects that were actually created
    if (event != NULL) {
        clReleaseEvent(event);
        event = NULL;
    }
    if (cl_output_matrix != NULL) {
        clReleaseMemObject(cl_output_matrix);
        cl_output_matrix = NULL;
    }
    if (cl_input_matrix != NULL) {
        clReleaseMemObject(cl_input_matrix);
        cl_input_matrix = NULL;
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
        queue = NULL;
    }
    if (context != NULL) {
        clReleaseContext(context);
        context = NULL;
    }

    if (debug) {
        CERR << result.str();
    }

    ErrorStatus errorStatus = { err, status };
    return errorStatus;
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. Perform SYRK for the lower right sub-matrices */ err = clblasSsyrk(order, uplo, transA, N - off, K - off, alpha, bufA, offA, lda, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsyrkEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. 
*/ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SSYRK placed in 'result' array. */ puts(""); printResult("clblasSsyrkEx result"); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }