template <typename ElemType> nano_time_t DotPerformanceTest<ElemType>::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; clFinish( queue); time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::dot( type, params_.N, mobjDP_, params_.offa, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS DOT function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; }
/**
 * Time one clBLAS SYR2K call on the first command queue of base_.
 *
 * Matrix C is first re-uploaded from backC_ into mobjC_, then SYR2K is
 * enqueued and flushed, and the time until waitForSuccessfulFinish() returns
 * is measured.
 *
 * @return the measured time (in getCurrentTime() units), or NANOTIME_ERR when
 *         the upload, the SYR2K call, the flush or the wait fails.
 */
template <typename ElemType> nano_time_t
Syr2kPerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];

    // Blocking upload of the C backup copy into the device buffer.
    status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0,
                                  params_.rowsC * params_.columnsC *
                                  sizeof(ElemType), backC_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "Matrix C buffer object enqueuing error, status = " <<
                 status << endl;

        return NANOTIME_ERR;
    }

    status = clWaitForEvents(1, &event);
    // The upload event is not used past this point; release it instead of
    // just dropping the handle, which would leak it.
    clReleaseEvent(event);
    event = NULL;
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

    status = (cl_int)clMath::clblas::syr2k(params_.order, params_.uplo,
                                           params_.transA, params_.N, params_.K,
                                           alpha_, mobjA_, params_.offA,
                                           params_.lda, mobjB_, params_.offBX,
                                           params_.ldb, beta_, mobjC_,
                                           params_.offCY, params_.ldc,
                                           1, &queue, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "The CLBLAS SYR2K function failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }

    // The clock covers only the wait for completion of the flushed command.
    time = getCurrentTime();
    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }

    return time;
}
/*
 * Correctness test for HERK: runs the host reference implementation and the
 * clBLAS implementation on the same random input and compares the resulting
 * N x N matrices with compareMatrices().
 *
 * The test is skipped (reported via SUCCEED()) when transA is clblasTrans,
 * when T is a double-precision type the device does not support, or when a
 * device buffer cannot be created. On a comparison failure the test
 * parameters, the seed and the number of queues are printed.
 */
void
herkCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *blasC, *clblasC;
    T alpha, beta;
    cl_mem bufA, bufC;
    clMath::BlasBase *base;
    bool useAlpha;
    bool useBeta;
    cl_event *events;

    if (params->transA == clblasTrans) {
        ::std::cerr << ">> herk(TRANSPOSE) for complex numbers "
                       "is not allowed." << ::std::endl <<
                       ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    A = new T[params->rowsA * params->columnsA];
    blasC = new T[params->rowsC * params->columnsC];
    clblasC = new T[params->rowsC * params->columnsC];

    // NOTE(review): plain new[] normally throws instead of returning NULL, so
    // this guard presumably only fires with a non-throwing allocator - confirm.
    if ((A == NULL) || (blasC == NULL) || (clblasC == NULL)) {
        deleteBuffers<T>(A, blasC, clblasC);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    useAlpha = true;
    useBeta = true;
    alpha = convertMultiplier<T>(params->alpha);
    beta = convertMultiplier<T>(params->beta);

    // No B matrix is generated (NULL, ld 0); C is N x N.
    randomGemmMatrices<T>(params->order, params->transA, clblasNoTrans,
                          params->N, params->N, params->K,
                          useAlpha, &alpha, A, params->lda,
                          NULL, 0, useBeta, &beta, blasC, params->ldc);

    // Both implementations must start from the same C.
    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));

    if (params->order == clblasColumnMajor) {
        ::clMath::blas::herk(clblasColumnMajor, params->uplo, params->transA,
                             params->N, params->K, CREAL(alpha), A, params->lda,
                             CREAL(beta), blasC, params->ldc);
    }
    else {
        // The reference is always called column-major. Row-major data is
        // passed unchanged; instead the triangle is flipped and NoTrans is
        // exchanged with ConjTrans.
        clblasTranspose fTransA = (params->transA == clblasNoTrans) ?
                                  clblasConjTrans : clblasNoTrans;
        clblasUplo fUplo = (params->uplo == clblasUpper) ?
                           clblasLower : clblasUpper;

        ::clMath::blas::herk(clblasColumnMajor, fUplo, fTransA,
                             params->N, params->K, CREAL(alpha), A, params->lda,
                             CREAL(beta), blasC, params->ldc);
    }

    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
                                     sizeof(*A), params->offA * sizeof(*A),
                                     CL_MEM_READ_ONLY);
    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
                                     sizeof(*clblasC),
                                     params->offCY * sizeof(*clblasC),
                                     CL_MEM_READ_WRITE);

    if ((bufA == NULL) || (bufC == NULL)) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufA, bufC);
        deleteBuffers<T>(A, blasC, clblasC);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::herk(params->order, params->uplo,
                                         params->transA, params->N, params->K,
                                         CREAL(alpha), bufA, params->offA,
                                         params->lda, CREAL(beta), bufC,
                                         params->offCY, params->ldc,
                                         params->numCommandQueues,
                                         base->commandQueues(),
                                         0, NULL, events);
    if (err != CL_SUCCESS) {
        // Clean up first: a failing ASSERT_EQ leaves the function.
        releaseMemObjects(bufA, bufC);
        deleteBuffers<T>(A, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HERK() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufC);
        deleteBuffers<T>(A, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read of the device result. The status is checked so that a
    // failed read-back is reported rather than showing up only as a mismatch.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
                              params->offCY * sizeof(*clblasC),
                              params->rowsC * params->columnsC * sizeof(*clblasC),
                              clblasC, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "HERK: Reading results failed...." << ::std::endl;
    }

    releaseMemObjects(bufA, bufC);

    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC,
                       params->ldc);

    if (::testing::Test::HasFailure()) {
        printTestParams(params->order, params->uplo, params->transA,
                        params->N, params->K, true, params->alpha,
                        params->offA, params->lda, true, params->beta,
                        params->offCY, params->ldc);
        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, blasC, clblasC);
    delete[] events;
}
/*
 * Correctness test for SPMV (packed symmetric matrix times vector).
 *
 * Generates random AP, X and Y, runs the host reference routine and the
 * clBLAS routine on the same data, and compares the two Y vectors. The test
 * is skipped (SUCCEED()) when T is cl_double and the device lacks double
 * precision support, or when a device buffer cannot be created.
 */
void
spmvCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *base = clMath::BlasBase::getInstance();

    const bool needsDouble = (typeid(T) == typeid(cl_double));
    if (needsDouble && !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic"
                  << std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Element counts: packed triangle of an N x N matrix, and the strided
    // extents of X and Y.
    size_t packedLen = (params->N * (params->N + 1)) / 2;
    size_t spanX = (1 + ((params->N - 1) * abs(params->incx)));
    size_t spanY = (1 + ((params->N - 1) * abs(params->incy)));

    // Every host array is preceded by its offset worth of elements.
    T *AP = new T[packedLen + params->offA];
    T *X = new T[spanX + params->offBX];
    T *blasY = new T[spanY + params->offCY];
    T *clblasY = new T[spanY + params->offCY];

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    if (AP == NULL || X == NULL || blasY == NULL || clblasY == NULL) {
        deleteBuffers<T>(AP, X, blasY, clblasY);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    T alpha = convertMultiplier<T>(params->alpha);
    T beta = convertMultiplier<T>(params->beta);

    randomSpmvMatrices(params->order, params->uplo, params->N,
                       true, &alpha, (AP + params->offA),
                       (X + params->offBX), params->incx,
                       true, &beta, (blasY + params->offCY), params->incy);

    // The clBLAS run starts from the same Y as the reference run.
    memcpy(clblasY, blasY, (spanY + params->offCY) * sizeof(*blasY));
    ::std::cerr << "Done" << ::std::endl;

    // Device buffers hold the complete host arrays, offsets included.
    cl_mem bufAP = base->createEnqueueBuffer(
        AP, (packedLen + params->offA) * sizeof(*AP), 0, CL_MEM_READ_ONLY);
    cl_mem bufX = base->createEnqueueBuffer(
        X, (spanX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY);
    cl_mem bufY = base->createEnqueueBuffer(
        clblasY, (spanY + params->offCY) * sizeof(*clblasY), 0,
        CL_MEM_READ_WRITE);

    ::std::cerr << "Calling reference xSPMV routine... ";

    // The reference is always invoked column-major; for any other order the
    // triangle selector is flipped instead.
    clblasUplo refUplo = params->uplo;
    if (params->order != clblasColumnMajor) {
        refUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
    }
    ::clMath::blas::spmv(clblasColumnMajor, refUplo, params->N, alpha,
                         AP, params->offA, X, params->offBX, params->incx,
                         beta, blasY, params->offCY, params->incy);
    ::std::cerr << "Done" << ::std::endl;

    const bool buffersOk = (bufAP != NULL) && (bufX != NULL) && (bufY != NULL);
    if (!buffersOk) {
        // Skip the test, the most probable reason is
        // matrix too big for a device.
        releaseMemObjects(bufAP, bufX, bufY);
        deleteBuffers<T>(AP, X, blasY, clblasY);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
                    << ::std::endl
                    << ">> Can't execute the test, because data is not transfered to GPU."
                    << ::std::endl
                    << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xSPMV routine... ";

    cl_int err = (cl_int)::clMath::clblas::spmv(
        params->order, params->uplo, params->N, alpha,
        bufAP, params->offA, bufX, params->offBX, params->incx,
        beta, bufY, params->offCY, params->incy,
        params->numCommandQueues, base->commandQueues(), 0, NULL, events);

    if (err != CL_SUCCESS) {
        // Free everything before asserting: a failing ASSERT_EQ returns.
        releaseMemObjects(bufAP, bufX, bufY);
        deleteBuffers<T>(AP, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPMV() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX, bufY);
        deleteBuffers<T>(AP, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read of the whole Y buffer back into clblasY.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
                              (spanY + params->offCY) * sizeof(*clblasY),
                              clblasY, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "SPMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX, bufY);

    // Y is compared as a spanY x 1 column-major matrix, skipping the offset.
    compareMatrices<T>(clblasColumnMajor, spanY, 1,
                       (blasY + params->offCY), (clblasY + params->offCY),
                       spanY);

    deleteBuffers<T>(AP, X, blasY, clblasY);
    delete[] events;
}
/*
 * Correctness test for HER2K: runs the host reference implementation and the
 * clBLAS implementation on the same random A, B and C and compares the
 * resulting N x N matrices with compareMatrices().
 *
 * The test is skipped (reported via SUCCEED()) when transA is clblasTrans,
 * when T is a double-precision type the device does not support, or when a
 * device buffer cannot be created.
 */
void
her2kCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *B, *blasC, *clblasC;
    T alpha, beta;
    cl_mem bufA, bufC, bufB;
    clMath::BlasBase *base;
    cl_event *events;

    if (params->transA == clblasTrans) {
        ::std::cerr << ">> her2k(TRANSPOSE) for complex numbers "
                       "is not allowed." << ::std::endl <<
                       ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    A = new T[params->rowsA * params->columnsA];
    B = new T[params->rowsB * params->columnsB];
    blasC = new T[params->rowsC * params->columnsC];
    clblasC = new T[params->rowsC * params->columnsC];

    // NOTE(review): plain new[] normally throws instead of returning NULL, so
    // this guard presumably only fires with a non-throwing allocator - confirm.
    if ((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL)) {
        deleteBuffers<T>(A, B, blasC, clblasC);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    alpha = convertMultiplier<T>(params->alpha);
    beta = convertMultiplier<T>(params->beta);

    ::std::cerr << "Generating input data... ";

    // B is generated with the transposition opposite to transA
    // (NoTrans <-> ConjTrans).
    clblasTranspose ftransB = (params->transA == clblasNoTrans) ?
                              clblasConjTrans : clblasNoTrans;

    randomGemmMatrices<T>(params->order, params->transA, ftransB,
                          params->N, params->N, params->K,
                          true, &alpha, A, params->lda,
                          B, params->ldb, true, &beta, blasC, params->ldc);

    // Both implementations must start from the same C.
    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
    ::std::cerr << "Done" << ::std::endl;

    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA *
                                     sizeof(*A), params->offA * sizeof(*A),
                                     CL_MEM_READ_ONLY);
    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB *
                                     sizeof(*B), params->offBX * sizeof(*B),
                                     CL_MEM_READ_ONLY);
    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC *
                                     sizeof(*clblasC),
                                     params->offCY * sizeof(*clblasC),
                                     CL_MEM_READ_WRITE);

    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling reference xHER2K routine... ";

    T fAlpha = alpha;
    if (params->order == clblasColumnMajor) {
        ::clMath::blas::her2k(clblasColumnMajor, params->uplo, params->transA,
                              params->N, params->K, fAlpha, A, 0, params->lda,
                              B, 0, params->ldb, CREAL(beta),
                              blasC, 0, params->ldc);
    }
    else {
        // The reference is always called column-major. For row-major input
        // alpha is conjugated, the triangle is flipped and NoTrans is
        // exchanged with ConjTrans.
        CIMAG( fAlpha ) *= -1.0;        // According to netlib C- interface
        clblasTranspose fTransA = (params->transA == clblasNoTrans) ?
                                  clblasConjTrans : clblasNoTrans;
        clblasUplo fUplo = (params->uplo == clblasUpper) ?
                           clblasLower : clblasUpper;

        ::clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA,
                              params->N, params->K, fAlpha, A, 0, params->lda,
                              B, 0, params->ldb, CREAL(beta),
                              blasC, 0, params->ldc);
    }
    ::std::cerr << "Done" << ::std::endl;

    ::std::cerr << "Calling clblas xHER2K routine... ";
    err = (cl_int)::clMath::clblas::her2k(params->order, params->uplo,
                                          params->transA, params->N, params->K,
                                          alpha, bufA, params->offA, params->lda,
                                          bufB, params->offBX, params->ldb,
                                          CREAL(beta), bufC, params->offCY,
                                          params->ldc,
                                          params->numCommandQueues,
                                          base->commandQueues(),
                                          0, NULL, events);
    if (err != CL_SUCCESS) {
        // Clean up first: a failing ASSERT_EQ leaves the function.
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2K() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read of the device result. The status is checked so that a
    // failed read-back is reported rather than showing up only as a mismatch.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
                              params->offCY * sizeof(*clblasC),
                              params->rowsC * params->columnsC * sizeof(*clblasC),
                              clblasC, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "HER2K: Reading results failed...." << ::std::endl;
    }

    releaseMemObjects(bufA, bufB, bufC);

    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC,
                       params->ldc);

    deleteBuffers<T>(A, B, blasC, clblasC);
    delete[] events;
}
template <typename ElemType> nano_time_t TrsvPerformanceTest<ElemType>::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::trsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda, mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TRSV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; }
void gemm2CorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC; T alpha, beta; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; bool useAlpha; bool useBeta; cl_event *events; base = clMath::BlasBase::getInstance(); useAlpha = base->useAlpha(); useBeta = base->useBeta(); alpha = ZERO<T>(); beta = ZERO<T>(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; if((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, B, blasC, clblasC); SUCCEED(); return; } srand(params->seed); if (useAlpha) { alpha = convertMultiplier<T>(params->alpha); } if (useBeta) { beta = convertMultiplier<T>(params->beta); } ::std::cerr << "Generating input data... "; randomGemmMatrices<T>(params->order, params->transA, params->transB, params->M, params->N, params->K, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xGEMM routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, A, params->lda, B, params->ldb, beta, blasC, params->ldc); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; T *reorderedC = new T[params->rowsC * params->columnsC]; if((reorderedA == NULL) || (reorderedB == NULL) || (reorderedC == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; SUCCEED(); return; } reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB, B, reorderedB); reorderMatrix<T>(clblasRowMajor, params->rowsC, params->columnsC, blasC, reorderedC); ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB, beta, reorderedC, params->rowsC); reorderMatrix<T>(clblasColumnMajor, params->rowsC, params->columnsC, reorderedC, blasC); delete[] reorderedC; delete[] reorderedB; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers<T>(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." 
<< ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xGEMM routine... "; err = (cl_int)::clMath::clblas::gemm2(params->order, params->transA, params->transB, params->M, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers<T>(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers<T>(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareMatrices<T>(params->order, params->M, params->N, blasC, clblasC, params->ldc); deleteBuffers<T>(A, B, blasC, clblasC); delete[] events; }
template <typename ElemType> nano_time_t HprPerformanceTest<ElemType>::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0, ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) * sizeof(ElemType), backAP_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hpr(params_.order, params_.uplo, params_.N, CREAL(alpha_), mobjX_, params_.offBX, params_.incx, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HPR function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; }
/**
 * Time the clBLAS SCAL routine on the first command queue of base_.
 *
 * Vector X is first re-uploaded from backX_ into mobjX_. With TIMING defined
 * (it is defined unconditionally below) SCAL is enqueued `iter` times back to
 * back, the queue is drained with clFinish(), and the elapsed
 * getCurrentTime() difference divided by `iter` is returned. Without TIMING a
 * single call is flushed and timed until waitForSuccessfulFinish() returns.
 *
 * @return the measured time per call (in getCurrentTime() units), or
 *         NANOTIME_ERR when the upload, the SCAL call, the flush or the wait
 *         fails.
 */
template <typename ElemType> nano_time_t
ScalPerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];
    bool is_css_zds = (params_.K == 1);     // K indicates csscal/zdscal

    // Blocking upload of the X backup copy into the device buffer.
    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
                                  (lengthX + params_.offBX) * sizeof(ElemType),
                                  backX_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        // The buffer written here is vector X, not a matrix.
        cerr << "Vector X buffer object enqueuing error, status = " <<
                 status << endl;

        return NANOTIME_ERR;
    }

    status = clWaitForEvents(1, &event);
    // The upload event is not used past this point; release it instead of
    // just dropping the handle, which would leak it.
    clReleaseEvent(event);
    event = NULL;
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

#define TIMING
#ifdef TIMING
    // Drain the queue BEFORE starting the clock, so that waiting for earlier
    // work is not counted as SCAL time.
    clFinish(queue);
    time = getCurrentTime();

    int iter = 50;
    for (int i = 1; i <= iter; i++) {
#endif
        status = (cl_int)clMath::clblas::scal(is_css_zds, params_.N, alpha_,
                                              mobjX_, params_.offBX,
                                              params_.incx,
                                              1, &queue, 0, NULL, &event);
        if (status != CL_SUCCESS) {
            cerr << "The CLBLAS SCAL function failed, status = " <<
                    status << endl;

            return NANOTIME_ERR;
        }
#ifdef TIMING
        // Completion of the whole batch is awaited with clFinish() below, so
        // the per-call event is not needed. Release it; otherwise every
        // iteration would leak one event handle.
        if (event != NULL) {
            clReleaseEvent(event);
            event = NULL;
        }
    }   // iter loop

    clFinish(queue);
    time = getCurrentTime() - time;
    time /= iter;
#else
    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }

    time = getCurrentTime();
    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }
#endif

    return time;
}
/**
 * Time the clBLAS ROT routine on the first command queue of base_.
 *
 * Vectors X and Y are first re-uploaded from X_ and Y_ into mobjX_ and
 * mobjY_. With TIMING defined (it is defined unconditionally below) ROT is
 * enqueued `iter` times back to back, the queue is drained with clFinish(),
 * and the elapsed getCurrentTime() difference divided by `iter` is returned.
 * Without TIMING a single call is flushed and timed until
 * waitForSuccessfulFinish() returns.
 *
 * @return the measured time per call (in getCurrentTime() units), or
 *         NANOTIME_ERR when an upload, the ROT call, the flush or the wait
 *         fails.
 */
template <typename ElemType> nano_time_t
RotPerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];

    // Blocking upload of X. No event is requested: the call only returns once
    // the copy is done, and the handle used to be overwritten (and leaked) by
    // the Y upload below.
    status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
                                  (lengthx + params_.offa) * sizeof(ElemType),
                                  X_, 0, NULL, NULL);
    if (status != CL_SUCCESS) {
        cerr << "Vector X buffer object enqueuing error, status = " <<
                 status << endl;

        return NANOTIME_ERR;
    }

    // Blocking upload of Y.
    status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
                                  (lengthy + params_.offb) * sizeof(ElemType),
                                  Y_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "Vector Y buffer object enqueuing error, status = " <<
                 status << endl;

        return NANOTIME_ERR;
    }

    status = clWaitForEvents(1, &event);
    // The upload event is not used past this point; release it instead of
    // just dropping the handle, which would leak it.
    clReleaseEvent(event);
    event = NULL;
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

#define TIMING
#ifdef TIMING
    // Drain the queue BEFORE starting the clock, so that waiting for earlier
    // work is not counted as ROT time.
    clFinish(queue);
    time = getCurrentTime();

    int iter = 50;
    for (int i = 1; i <= iter; i++) {
#endif
        status = (cl_int)clMath::clblas::rot(params_.N,
                                             mobjX_, params_.offa, params_.incx,
                                             mobjY_, params_.offb, params_.incy,
                                             alpha, beta,
                                             1, &queue, 0, NULL, &event);
        if (status != CL_SUCCESS) {
            cerr << "The CLBLAS ROT function failed, status = " <<
                    status << endl;

            return NANOTIME_ERR;
        }
#ifdef TIMING
        // Completion of the whole batch is awaited with clFinish() below, so
        // the per-call event is not needed. Release it; otherwise every
        // iteration would leak one event handle.
        if (event != NULL) {
            clReleaseEvent(event);
            event = NULL;
        }
    }   // iter loop

    clFinish(queue);
    time = getCurrentTime() - time;
    time /= iter;
#else
    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }

    time = getCurrentTime();
    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }
#endif

    return time;
}
/*
 * Correctness test for SYR (symmetric rank-1 update).
 *
 * Generates random A and X, runs the host reference routine and the clBLAS
 * routine on the same data, and compares the resulting N x N matrices with
 * compareMatrices(). The test is skipped (SUCCEED()) when T is cl_double and
 * the device lacks double precision support, or when a device buffer cannot
 * be created.
 */
void
syrCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *blasA, *clblasA, *X;
    cl_mem bufA, bufX;
    clMath::BlasBase *base;
    cl_event *events;
    bool useAlpha;
    T alpha;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Element counts, not including the leading offsets.
    size_t lengthA = params->N * params->lda;
    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));

    blasA = new T[lengthA + params->offa ];
    clblasA = new T[lengthA + params->offa ];
    X = new T[lengthX + params->offBX ];

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    // Checked before the buffers are touched by memset().
    // NOTE(review): plain new[] normally throws instead of returning NULL, so
    // this guard presumably only fires with a non-throwing allocator - confirm.
    if ((blasA == NULL) || (X == NULL) || (clblasA == NULL)) {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(blasA, clblasA, X);
        delete[] events;
        SUCCEED();
        return;
    }

    // Pre-fill every byte of the host buffers with 0xFF. memset() takes a
    // size in bytes, so the element count is scaled by the element size;
    // without the scaling only the leading part of each buffer was filled.
    memset(blasA, -1, (lengthA + params->offa) * sizeof(*blasA));
    memset(clblasA, -1, (lengthA + params->offa) * sizeof(*clblasA));
    memset(X, -1, (lengthX + params->offBX) * sizeof(*X));

    alpha = convertMultiplier<T>(params->alpha);
    useAlpha = true;

#ifdef DEBUG_SYR
    printf("ALPHA in CORR_SYR.CPP %f\n", alpha);
#endif

    randomSyrMatrices<T>(params->order, params->uplo, params->N,
                         useAlpha, &alpha, (blasA + params->offa), params->lda,
                         (X + params->offBX), params->incx);

    // Copy blasA to clblasA so both implementations start from the same A.
    memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA));
    ::std::cerr << "Done" << ::std::endl;

    // Allocate buffers; they hold the complete host arrays, offsets included.
    bufA = base->createEnqueueBuffer(clblasA,
                                     (lengthA + params->offa) * sizeof(*clblasA),
                                     0, CL_MEM_READ_WRITE);
    bufX = base->createEnqueueBuffer(X,
                                     (lengthX + params->offBX)* sizeof(*X),
                                     0, CL_MEM_READ_ONLY);

    ::std::cerr << "Calling reference xSYR routine... ";

    // The reference is always invoked column-major on blasA in place; for any
    // other order the triangle selector is flipped instead. (The unused
    // scratch copy that used to be allocated and freed here is gone.)
    clblasUplo fUplo = params->uplo;
    if (params->order != clblasColumnMajor) {
        fUplo = (fUplo == clblasUpper) ? clblasLower : clblasUpper;
    }
    ::clMath::blas::syr(clblasColumnMajor, fUplo, params->N, alpha,
                        X, params->offBX, params->incx,
                        blasA, params->offa, params->lda);
    ::std::cerr << "Done" << ::std::endl;

    if ((bufA == NULL) || (bufX == NULL) ) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufA, bufX);
        deleteBuffers<T>(blasA, clblasA, X);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xSYR routine... ";

    err = (cl_int)::clMath::clblas::syr(params->order, params->uplo, params->N,
                                        alpha, bufX, params->offBX, params->incx,
                                        bufA, params->offa, params->lda,
                                        params->numCommandQueues,
                                        base->commandQueues(),
                                        0, NULL, events);

    if (err != CL_SUCCESS) {
        // Clean up first: a failing ASSERT_EQ leaves the function.
        releaseMemObjects(bufA, bufX);
        deleteBuffers<T>(blasA, clblasA, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufX);
        deleteBuffers<T>(blasA, clblasA, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read of the whole A buffer back into clblasA.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
                              (lengthA + params->offa) * sizeof(*clblasA),
                              clblasA, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "SYR: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufX);

    printf("Comparing the results\n");
    compareMatrices<T>(params->order, params->N , params->N,
                       (blasA + params->offa), (clblasA + params->offa),
                       params->lda);

    deleteBuffers<T>(blasA, clblasA, X);
    delete[] events;
}
void nrm2CorrectnessTest(TestParams *params) { cl_int err; T1 *blasX; T2 *clblasNRM2, *blasNRM2; cl_mem bufX, bufNRM2, scratchBuff; clMath::BlasBase *base; cl_event *events; cl_double deltaForType = 0.0; base = clMath::BlasBase::getInstance(); if ((typeid(T1) == typeid(cl_double) || typeid(T1) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasX = new T1[lengthX + params->offBX ]; blasNRM2 = new T2[1]; clblasNRM2 = new T2[1 + params->offa]; if((blasX == NULL) || (clblasNRM2 == NULL) || (blasNRM2 == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors<T1>(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufNRM2 = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * 2 * sizeof(T1)), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xNRM2 routine... 
"; *blasNRM2 = ::clMath::blas::nrm2( params->N, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufNRM2 == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xNRM2 routine... "; DataType type; type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; err = (cl_int)::clMath::clblas::nrm2( type, params->N, bufNRM2, params->offa, bufX, params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::NRM2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufNRM2, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasNRM2), clblasNRM2, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "NRM2: Reading results failed...." 
<< std::endl; } releaseMemObjects(bufX, bufNRM2, scratchBuff); deltaForType = DELTA_0<T1>(); // Since every element of X encounters a division, delta would be sum of deltas for every element in X cl_double delta = 0; for(unsigned int i=0; i<(params->N); i++) { delta += deltaForType * returnMax<T1>(blasX[params->offBX + i]); } compareValues<T2>( (blasNRM2), (clblasNRM2+params->offa), delta); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; }
template <typename ElemType> nano_time_t HpmvPerformanceTest<ElemType>::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; int lenY = 1 + (params_.N-1) * abs(params_.incy); status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hpmv(params_.order, params_.uplo, params_.N, alpha, mobjAP_, params_.offA, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HPMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } //printf("Time elapsed : %lu\n", time); #endif return time; }
/*
 * Correctness test for xTPMV (triangular packed matrix * vector).
 *
 * Populates a packed matrix AP and a vector X, runs the reference
 * clMath::blas::tpmv on the host copy (blasX) and clMath::clblas::tpmv on
 * the device copy (clblasX), then compares the two result vectors.
 *
 * The test is reported as skipped (SUCCEED) when T is double based and the
 * device has no double precision support, or when a device buffer could not
 * be created.
 */
void tpmvCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *AP, *blasX, *clblasX;
    cl_mem bufAP, bufX, bufXTemp;
    clMath::BlasBase *base;
    cl_event *events;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Packed storage holds one triangle: N*(N+1)/2 elements
    size_t lengthAP = (params->N *( params->N + 1 ))/2 ;
    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));

    AP      = new T[lengthAP + params->offa ];
    blasX   = new T[lengthX + params->offBX ];
    clblasX = new T[lengthX + params->offBX ];

    if((AP == NULL) || (blasX == NULL) || (clblasX == NULL))
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(AP, blasX, clblasX);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    // Set data in A and X using populate() routine
    int creationFlags = 0;
    creationFlags = creationFlags | RANDOM_INIT | PACKED_MATRIX;

    // Default is Column-Major
    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
    creationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY);
    BlasRoutineID BlasFn = CLBLAS_TRMV;

    // Populate A and blasX
    populate( AP + params->offa, params-> N, params-> N, 0, BlasFn, creationFlags);
    populate( blasX , (lengthX + params->offBX), 1, (lengthX + params->offBX), BlasFn);

    // Copy blasX to clblasX
    memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX));
    ::std::cerr << "Done" << ::std::endl;

    // Allocate buffers
    bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa)* sizeof(*AP), 0, CL_MEM_READ_ONLY);
    // X is both the input and the output of TPMV, and the scratch buffer is
    // written by the routine, so both must be readable and writable from a
    // kernel. Write-only / read-only flags make those accesses undefined.
    bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_READ_WRITE);
    bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_WRITE);

    //printData( "bufX", blasX, lengthX, 1, lengthX);
    //printData( "clblasX", clblasX, lengthX, 1, lengthX);

    ::std::cerr << "Calling reference xTPMV routine... ";

    clblasOrder order;
    clblasUplo fUplo;
    clblasTranspose fTrans;

    order = params->order;
    fUplo = params->uplo;
    fTrans = params->transA;

    // The reference routine is always called column-major: a row-major
    // problem is mapped onto it by flipping the triangle and the transpose
    // flag (and conjugating AP for the conjugate-transpose case).
    if (order != clblasColumnMajor)
    {
        order = clblasColumnMajor;
        fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper;
        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;

        if( params->transA == clblasConjTrans )
            doConjugate( (AP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );
    }
    ::clMath::blas::tpmv( order, fUplo, fTrans, params->diag, params->N, AP, params->offa, blasX, params->offBX, params->incx);
    ::std::cerr << "Done" << ::std::endl;

    // Hold X vector

    if ((bufAP == NULL) || (bufX == NULL) || (bufXTemp == NULL)) {
        /* Skip the test, the most probable reason is
         *     matrix too big for a device.
         */
        releaseMemObjects(bufAP, bufX, bufXTemp);
        deleteBuffers<T>(AP, blasX, clblasX);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xTPMV routine... ";

    DataType type;
    type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT :
           ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE:
           ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE;

    // Should use bufXTemp as well
    err = (cl_int)::clMath::clblas::tpmv( type, params->order, params->uplo, params->transA, params->diag, params->N, bufAP,
                    params->offa, bufX, params->offBX, params->incx, bufXTemp, params->numCommandQueues, base->commandQueues(),
                    0, NULL, events);

    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX, bufXTemp);
        deleteBuffers<T>(AP, blasX, clblasX);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TPMV() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
        base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX, bufXTemp);
        deleteBuffers<T>(AP, blasX, clblasX);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0,
        (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0,
        NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ::std::cerr << "TPMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX, bufXTemp);
    compareMatrices<T>(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX),
                       lengthX);

    deleteBuffers<T>(AP, blasX, clblasX);
    delete[] events;
}
/*
 * Correctness test for xHEMM.
 *
 * Builds random A, B and C, evaluates the reference clMath::blas::hemm on
 * the host (always in column-major form) and clMath::clblas::hemm on the
 * device, and compares the resulting C matrices.
 *
 * The test is reported as skipped (SUCCEED) when T is DoubleComplex and the
 * device has no double precision support, or when a device buffer could not
 * be created.
 */
void hemmCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    cl_event *events = new cl_event[params->numCommandQueues];
    if (events == NULL)
    {
        std::cerr << ">> WARNING: Unable to allocate memory for events" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // A is square; its order follows the side it multiplies from
    const size_t ka = (params->side == clblasLeft) ? params->M : params->N;
    // Number of leading-dimension strides stored for B and C
    const size_t kbc = (params->order == clblasColumnMajor) ? params->N : params->M;

    const size_t lengthA = ka * params->lda;
    const size_t lengthB = kbc * params->ldb;
    const size_t lengthC = kbc * params->ldc;

    T alpha_ = convertMultiplier<T>(params->alpha);
    T beta_  = convertMultiplier<T>(params->beta);

    T *A     = new T[ lengthA + params->offA ];
    T *B     = new T[ lengthB + params->offBX ];
    T *C     = new T[ lengthC + params->offCY ];
    T *backC = new T[ lengthC + params->offCY ];

    if (!A || !B || !C || !backC)
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(A, B, C, backC);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    ::std::cerr << "Generating input data... " << std::endl;

    // Common generation flags, plus the stored triangle for A only
    int creationFlags = 0 | RANDOM_INIT;
    if ((params->order) == clblasRowMajor) {
        creationFlags = creationFlags | ROW_MAJOR_ORDER;
    }
    int AcreationFlags;
    if ((params->uplo) == clblasLower) {
        AcreationFlags = creationFlags | LOWER_HALF_ONLY;
    }
    else {
        AcreationFlags = creationFlags | UPPER_HALF_ONLY;
    }
    BlasRoutineID BlasFn = CLBLAS_HEMM;

    populate( A + params->offA , ka, ka, params-> lda, BlasFn, AcreationFlags);
    populate( B + params->offBX , params-> M, params-> N, params-> ldb, BlasFn, creationFlags);
    populate( C + params->offCY , params-> M, params-> N, params-> ldc, BlasFn, creationFlags);

    // backC is what gets uploaded and later receives the device result
    memcpy(backC, C, (lengthC + params->offCY) * sizeof(T));

    //printMatrixBlock( params->order, 0, 0, params->M, params->N, params->ldc, backC);

    // Allocate buffers
    cl_mem bufA = base->createEnqueueBuffer(A, (lengthA + params->offA) * sizeof(T), 0, CL_MEM_READ_ONLY);
    cl_mem bufB = base->createEnqueueBuffer(B, (lengthB + params->offBX) * sizeof(T), 0, CL_MEM_READ_ONLY);
    cl_mem bufC = base->createEnqueueBuffer(backC, (lengthC + params->offCY) * sizeof(T), 0, CL_MEM_READ_WRITE);
    ::std::cerr << "Done" << ::std::endl;

    ::std::cerr << "Calling reference xHEMM routine... ";

    // The reference is always invoked column-major. A row-major problem is
    // mapped onto it by swapping M/N and flipping side and triangle.
    const bool rowMajorInput = (params->order != clblasColumnMajor);
    const clblasOrder fOrder = clblasColumnMajor;
    size_t fM = params->M;
    size_t fN = params->N;
    clblasSide fSide = params->side;
    clblasUplo fUplo = params->uplo;

    if (rowMajorInput) {
        fM = params->N;
        fN = params->M;
        if (params->side == clblasLeft) {
            fSide = clblasRight;
        }
        else {
            fSide = clblasLeft;
        }
        if (params->uplo == clblasUpper) {
            fUplo = clblasLower;
        }
        else {
            fUplo = clblasUpper;
        }
    }

    // Call reference blas routine
    clMath::blas::hemm(fOrder, fSide, fUplo, fM, fN, alpha_,
                       A, params->offA, params->lda,
                       B, params->offBX, params->ldb,
                       beta_,
                       C, params->offCY, params->ldc);
    ::std::cerr << "Done" << ::std::endl;

    if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) {
        /* Skip the test, the most probable reason is
         *     matrix too big for a device.
         */
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, C, backC);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xHEMM routine... ";

    cl_int err = (cl_int)::clMath::clblas::hemm( params->order, params->side, params->uplo,
                                                 params->M, params->N, alpha_,
                                                 bufA, params->offA, params->lda,
                                                 bufB, params->offBX, params->ldb,
                                                 beta_,
                                                 bufC, params->offCY, params->ldc,
                                                 params->numCommandQueues, base->commandQueues(),
                                                 0, NULL, events );

    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, C, backC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HEMM() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, C, backC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
                              (lengthC + params->offCY) * sizeof(T), backC, 0,
                              NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ::std::cerr << "WARNING: corr-hemm: Erorr reading buffer..." << err << ::std::endl;
    }

    //printMatrixBlock( params->order, 0, 0, params->M, params->N, params->ldc, backC);

    releaseMemObjects(bufA, bufB, bufC);

    // handle lda correctly based on row-major/col-major..
    compareMatrices<T>(params->order, params->M , params->N, (C + params->offCY), (backC + params->offCY), params->ldc);

    deleteBuffers<T>(A, B, C, backC);
    delete[] events;
}
void Extratest(size_t M, size_t N, size_t lda, size_t ldb, T alpha, T delta) { T *A, *B, *blasB, *clblasB; cl_mem bufA, bufB; clMath::BlasBase *base; cl_event *events; cl_int err; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } clblasOrder order = clblasColumnMajor; clblasSide side = clblasLeft; clblasUplo uplo = clblasUpper; clblasTranspose trans = clblasNoTrans; clblasDiag diag = clblasNonUnit; A = new T[M * lda]; B = new T[N * ldb]; blasB = new T[N * ldb]; clblasB = new T[N * ldb]; memset(A, 0, M*lda*sizeof(T)); memset(B, 0, N*ldb*sizeof(T)); for(int i=0; i<M; i++) // down each column { for(int j=0; j<M; j++) // down each row { AssignA<T>(A, i, j, lda); } } for(int i=0; i<N; i++) // down each column { for(int j=0; j<M; j++) // down each row { AssignB<T>(B, i, j, ldb, M); } } memcpy(blasB, B, N*ldb*sizeof(T)); memcpy(clblasB, B, N*ldb*sizeof(T)); ::std::cerr << "Calling reference xTRSM routine... "; ::clMath::blas::trsm(order, side, uplo, trans, diag, M, N, alpha, A, lda, blasB, ldb); bufA = base->createEnqueueBuffer(A, M*lda*sizeof(T), 0, CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(clblasB, N*ldb*sizeof(T), 0, CL_MEM_READ_WRITE); events = new cl_event[1]; memset(events, 0, sizeof(cl_event)); if ((bufA == NULL) || (bufB == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB); deleteBuffers<T>(A, B, blasB, clblasB, NULL); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRSM routine... "; err = (cl_int)::clMath::clblas::trsm(order, side, uplo, trans, diag, M, N, alpha, bufA, 0, lda, bufB, 0, ldb, 1, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers<T>(A, B, blasB, clblasB, NULL); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSM() failed"; } err = waitForSuccessfulFinish(1, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers<T>(A, B, blasB, clblasB, NULL); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, 0, N*ldb*sizeof(T), clblasB, 0, NULL, NULL); releaseMemObjects(bufA, bufB); // Validate the answer for(int i=0; i<N; i++) // down each column { for(int j=0; j<ldb; j++) // down each row { local_assert(blasB[i*ldb + j], clblasB[i*ldb + j], delta); } } deleteBuffers<T>(A, B, blasB, clblasB, NULL); delete[] events; }
/*
 * Correctness test for xROTG.
 *
 * T1 is the type of SA, SB and S; T2 is the type of C. A random (SA, SB)
 * pair is generated, the reference clMath::blas::rotg runs on the back_*
 * copies, clMath::clblas::rotg runs on the device copies, and all four
 * outputs are compared one by one.
 *
 * The test is reported as skipped (SUCCEED) when T1 is double based and the
 * device has no double precision support, or when a device buffer could not
 * be created.
 */
void rotgCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *base = clMath::BlasBase::getInstance();

    if ((typeid(T1) == typeid(cl_double) || typeid(T1) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    size_t length = 1;//only one element need to be accessed always

    // Device-side copies (SA, SB, C, S) and reference-side copies (back_*)
    T1 *SA      = new T1[length + params->offBX ];
    T1 *SB      = new T1[length + params->offCY ];
    T2 *C       = new T2[length + params->offa ];
    T1 *S       = new T1[length + params->offb ];
    T1 *back_SA = new T1[length + params->offBX ];
    T1 *back_SB = new T1[length + params->offCY ];
    T2 *back_C  = new T2[length + params->offa ];
    T1 *back_S  = new T1[length + params->offb ];

    if (!SA || !SB || !C || !S ||
        !back_SA || !back_SB || !back_C || !back_S)
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
        deleteBuffers<T2>(C, back_C);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    //Filling random values for SA and SB. C & S are only for output sake
    randomVectors(1, (SA+params->offBX), 1, (SB+params->offCY), 1);

    back_S[params->offb] = ZERO<T1>();
    S[params->offb] = back_S[params->offb];
    back_C[params->offa] = ZERO<T2>();
    C[params->offa] = back_C[params->offa];
    back_SA[params->offBX] = SA[params->offBX];
    back_SB[params->offCY] = SB[params->offCY];
    ::std::cerr << "Done" << ::std::endl;

    //printing the inputs, as they change after processing
    ::std::cerr << "A = ";
    printElement<T1>(SA[params->offBX]);
    ::std::cerr << "\tB = ";
    printElement<T1>(SB[params->offCY]);
    ::std::cerr << "\tC = ";
    printElement<T2>(C[params->offa]);
    ::std::cerr << "\tS = ";
    printElement<T1>(S[params->offb]);
    ::std::cout << std::endl << std::endl;

    // Allocate buffers
    cl_mem bufSA = base->createEnqueueBuffer(SA, (length + params->offBX) * sizeof(T1), 0, CL_MEM_READ_WRITE);
    cl_mem bufSB = base->createEnqueueBuffer(SB, (length + params->offCY) * sizeof(T1), 0, CL_MEM_READ_WRITE);
    cl_mem bufC  = base->createEnqueueBuffer(C,  (length + params->offa ) * sizeof(T2), 0, CL_MEM_WRITE_ONLY);
    cl_mem bufS  = base->createEnqueueBuffer(S,  (length + params->offb ) * sizeof(T1), 0, CL_MEM_WRITE_ONLY);

    ::std::cerr << "Calling reference xROTG routine... ";

    ::clMath::blas::rotg(back_SA, params->offBX, back_SB, params->offCY, back_C, params->offa, back_S, params->offb);
    ::std::cerr << "Done" << ::std::endl;

    // Hold X vector

    if ((bufSA == NULL) || (bufSB == NULL) || (bufC == NULL) || (bufS == NULL))
    {
        releaseMemObjects(bufSA, bufSB, bufC, bufS);
        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
        deleteBuffers<T2>(C, back_C);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xROTG routine... ";

    // Map the element type onto the runtime type tag the wrapper expects
    DataType type;
    if (typeid(T1) == typeid(cl_float)) {
        type = TYPE_FLOAT;
    }
    else if (typeid(T1) == typeid(cl_double)) {
        type = TYPE_DOUBLE;
    }
    else if (typeid(T1) == typeid(cl_float2)) {
        type = TYPE_COMPLEX_FLOAT;
    }
    else {
        type = TYPE_COMPLEX_DOUBLE;
    }

    cl_int err = (cl_int)::clMath::clblas::rotg( type, bufSA, params->offBX, bufSB, params->offCY,
                                                 bufC, params->offa, bufS, params->offb,
                                                 params->numCommandQueues, base->commandQueues(),
                                                 0, NULL, events);

    if (err != CL_SUCCESS)
    {
        releaseMemObjects(bufSA, bufSB, bufC, bufS);
        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
        deleteBuffers<T2>(C, back_C);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTG() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS)
    {
        releaseMemObjects(bufSA, bufSB, bufC, bufS);
        deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
        deleteBuffers<T2>(C, back_C);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // All four reads are always attempted; their statuses are OR-ed together
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufSA, CL_TRUE, 0,
                              (length + params->offBX) * sizeof(T1), SA, 0, NULL, NULL);
    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufSB, CL_TRUE, 0,
                               (length + params->offCY) * sizeof(T1), SB, 0, NULL, NULL);
    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0,
                               (length + params->offa) * sizeof(T2), C, 0, NULL, NULL);
    err |= clEnqueueReadBuffer(base->commandQueues()[0], bufS, CL_TRUE, 0,
                               (length + params->offb) * sizeof(T1), S, 0, NULL, NULL);

    if (err != CL_SUCCESS)
    {
        ::std::cerr << "ROTG: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufSA, bufSB, bufC, bufS);

    cl_double deltaForType = DELTA_0<T1>();

    // Each output gets its own tolerance, scaled by the reference value
    cl_double delta;

    delta = deltaForType * returnMax<T1>(back_SA[params->offBX]);
    compareValues<T1>( (back_SA + params->offBX), (SA + params->offBX), delta);

    delta = deltaForType * returnMax<T1>(back_SB[params->offCY]);
    compareValues<T1>( (back_SB + params->offCY), (SB + params->offCY), delta);

    delta = deltaForType * returnMax<T2>(back_C[params->offa]);
    compareValues<T2>( (back_C + params->offa), (C + params->offa), delta);

    delta = deltaForType * returnMax<T1>(back_S[params->offb]);
    compareValues<T1>( (back_S + params->offb), (S + params->offb), delta);

    deleteBuffers<T1>(SA, SB, S, back_SA, back_SB, back_S);
    deleteBuffers<T2>(C, back_C);
    delete[] events;
}
void trsmCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasB, *clblasB; T alpha; cl_mem bufA, bufB; cl_double *delta; clMath::BlasBase *base; bool useAlpha; cl_event *events; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); alpha = ZERO<T>(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasB = new T[params->rowsB * params->columnsB]; clblasB = new T[params->rowsB * params->columnsB]; delta = new cl_double[params->rowsB * params->columnsB]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier<T>(params->alpha); } ::std::cerr << "Generating input data... "; randomTrsmMatrices<T>(params->order, params->side, params->uplo, params->diag, params->M, params->N, useAlpha, &alpha, A, params->lda, B, params->ldb); memcpy(blasB, B, params->rowsB * params->columnsB * sizeof(*B)); memcpy(clblasB, B, params->rowsB * params->columnsB * sizeof(*B)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xTRSM routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, A, params->lda, blasB, params->ldb); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; reorderMatrix<T>(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix<T>(clblasRowMajor, params->rowsB, params->columnsB, blasB, reorderedB); ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB); reorderMatrix<T>(clblasColumnMajor, params->rowsB, params->columnsB, reorderedB, blasB); delete[] reorderedB; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(clblasB, params->rowsB * params->columnsB * sizeof(*clblasB), params->offBX * sizeof(*clblasB), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB); deleteBuffers<T>(A, B, blasB, clblasB, delta); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRSM routine... 
"; err = (cl_int)::clMath::clblas::trsm(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers<T>(A, B, blasB, clblasB, delta); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers<T>(A, B, blasB, clblasB, delta); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, params->offBX * sizeof(*clblasB), params->rowsB * params->columnsB * sizeof(*clblasB), clblasB, 0, NULL, NULL); releaseMemObjects(bufA, bufB); trsmDelta<T>(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, A, params->lda, B, params->ldb, alpha, delta); compareMatrices<T>(params->order, params->M, params->N, blasB, clblasB, params->ldb, delta); deleteBuffers<T>(A, B, blasB, clblasB, delta); delete[] events; }
/*
 * Correctness test for xAXPY.
 *
 * Generates random X and Y, keeps host copies (blasX, blasY) for the
 * reference clMath::blas::axpy, runs clMath::clblas::axpy on the device
 * copies and compares the resulting Y vectors.
 *
 * The test is reported as skipped (SUCCEED) when T is double based and the
 * device has no double precision support, or when a device buffer could not
 * be created.
 */
void axpyCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *base = clMath::BlasBase::getInstance();

    const bool doubleBased = (typeid(T) == typeid(cl_double) ||
                              typeid(T) == typeid(DoubleComplex));
    if (doubleBased && !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Array slots spanned by N elements at the given strides
    const size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));
    const size_t lengthY = (1 + ((params->N -1) * abs(params->incy)));

    // X, Y feed the OpenCL implementation; blasX, blasY feed the reference
    T *X     = new T[lengthX + params->offBX ];
    T *Y     = new T[lengthY + params->offCY ];
    T *blasX = new T[lengthX + params->offBX ];
    T *blasY = new T[lengthY + params->offCY ];

    if (!X || !blasX || !Y || !blasY)
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(X, Y, blasX, blasY);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);
    ::std::cerr << "Generating input data... ";

    // Populate X and Y
    randomVectors(params->N, (X+params->offBX), params->incx, (Y+params->offCY), params->incy);

    memcpy(blasX, X, (lengthX + params->offBX) * sizeof(T));
    memcpy(blasY, Y, (lengthY + params->offCY) * sizeof(T));
    T alpha = convertMultiplier<T>(params->alpha);
    ::std::cerr << "Done" << ::std::endl;

    // Allocate buffers
    cl_mem bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_ONLY);
    cl_mem bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(T), 0, CL_MEM_READ_WRITE);

    if ((bufX == NULL) || (bufY == NULL))
    {
        /* Skip the test, the most probable reason is
         *     matrix too big for a device.
         */
        releaseMemObjects(bufX, bufY);
        deleteBuffers<T>(X, Y, blasX, blasY);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling reference xAXPY routine... ";

    ::clMath::blas::axpy((size_t)params->N, alpha, blasX, (size_t)params->offBX, params->incx,
                         blasY, (size_t)params->offCY, params->incy);
    ::std::cerr << "Done" << ::std::endl;

    ::std::cerr << "Calling clblas xAXPY routine... ";

    cl_int err = (cl_int)::clMath::clblas::axpy(params->N, alpha, bufX, params->offBX, params->incx,
                                                bufY, params->offCY, params->incy,
                                                params->numCommandQueues, base->commandQueues(),
                                                0, NULL, events);

    if (err != CL_SUCCESS)
    {
        releaseMemObjects(bufX, bufY);
        deleteBuffers<T>(X, Y, blasX, blasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::AXPY() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS)
    {
        releaseMemObjects(bufX, bufY);
        deleteBuffers<T>(X, Y, blasX, blasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read of the whole Y buffer (offset region included)
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
                              (lengthY + params->offCY) * sizeof(T), Y, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ::std::cerr << "AXPY: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufX, bufY);

    compareMatrices<T>(clblasRowMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), 1);

    deleteBuffers<T>(X, Y, blasX, blasY);
    delete[] events;
}
/*
 * Correctness test for the xSPR routine (rank-1 update of a packed matrix).
 *
 * A random packed matrix AP and vector X are generated. The host reference
 * clMath::blas::spr() updates blasAP while ::clMath::clblas::spr() updates a
 * device copy, which is read back into clblasAP; the two packed arrays are
 * then compared with compareMatrices().
 *
 * The reference is always invoked as column-major. For any other order the
 * uplo flag is flipped instead.
 *
 * The test is reported through SUCCEED() and abandoned when T is cl_double
 * on a device without double support, or when host arrays or device buffers
 * come back as NULL.
 *
 * T is the element type; it is expected to come from the enclosing template
 * declaration, which is not part of this block.
 *
 * params - test configuration; this function reads order, uplo, transA, N,
 *          incx, offa, offBX, alpha, seed and numCommandQueues.
 */
void sprCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *blasAP, *clblasAP, *X;
    cl_mem bufAP, bufX;
    clMath::BlasBase *base;
    cl_event *events;
    bool useAlpha;
    T alpha;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double)) &&
        !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Packed storage holds N*(N+1)/2 elements.
    size_t lengthAP = ((params->N * (params->N + 1)) / 2);
    size_t lengthX = (1 + ((params->N - 1) * abs(params->incx)));

    blasAP = new T[lengthAP + params->offa];
    clblasAP = new T[lengthAP + params->offa];
    X = new T[lengthX + params->offBX];

    // Validate the pointers before anything writes through them.
    if ((blasAP == NULL) || (X == NULL) || (clblasAP == NULL)) {
        ::std::cerr << "Cannot allocate memory on host side\n"
                    << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    // Pre-fill every byte of each array, including the offset prefix.
    // memset() takes a byte count, so the element count is scaled by
    // sizeof(T).
    memset(blasAP, -1, (lengthAP + params->offa) * sizeof(T));
    memset(clblasAP, -1, (lengthAP + params->offa) * sizeof(T));
    memset(X, -1, (lengthX + params->offBX) * sizeof(T));

    alpha = convertMultiplier<T>(params->alpha);
    useAlpha = true;

#ifdef DEBUG_SPR
    printf("ALPHA in CORR_SPR.CPP %f\n", alpha);
#endif

    randomSyrMatrices<T>(params->order, params->uplo, params->N, useAlpha,
                         &alpha, (blasAP + params->offa), 0,
                         (X + params->offBX), params->incx);

    // The device starts from the same packed matrix as the reference.
    memcpy(clblasAP, blasAP, (lengthAP + params->offa) * sizeof(*blasAP));

    bufAP = base->createEnqueueBuffer(clblasAP,
                (lengthAP + params->offa) * sizeof(*clblasAP), 0,
                CL_MEM_READ_WRITE);
    bufX = base->createEnqueueBuffer(X,
                (lengthX + params->offBX) * sizeof(*X), 0,
                CL_MEM_READ_ONLY);

    // Map a non column-major request onto the column-major reference by
    // flipping the triangle selector.
    clblasUplo fUplo = params->uplo;
    if (params->order != clblasColumnMajor) {
        fUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;

        if (params->transA == clblasConjTrans) {
            doConjugate((blasAP + params->offa), lengthAP, 1, 1);
        }
    }

    clMath::blas::spr(clblasColumnMajor, fUplo, params->N, alpha,
                      X, params->offBX, params->incx,
                      blasAP, params->offa);

    if ((bufAP == NULL) || (bufX == NULL)) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::spr(params->order, params->uplo,
              params->N, alpha, bufX, params->offBX, params->incx,
              bufAP, params->offa,
              params->numCommandQueues, base->commandQueues(),
              0, NULL, events);

    if (err != CL_SUCCESS) {
        // Resources are released first because the assertion below ends
        // the function.
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPR() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read of the updated packed matrix.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
              (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP,
              0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "SPR: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX);

    compareMatrices<T>(clblasColumnMajor, lengthAP, 1,
                       (blasAP + params->offa), (clblasAP + params->offa),
                       lengthAP);

    deleteBuffers<T>(blasAP, clblasAP, X);
    delete[] events;
}
void rotmgCorrectnessTest(TestParams *params) { cl_int err; T *D1, *D2, *X, *Y, *PARAM; T *back_D1, *back_D2, *back_X, *back_Y, *back_PARAM; T sflagParam; cl_mem bufD1, bufD2, bufX, bufY, bufParam; clMath::BlasBase *base; cl_event *events; cl_double deltaForType = 0.0; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); X = new T[1 + params->offBX]; Y = new T[1 + params->offCY]; D1 = new T[1 + params->offa]; D2 = new T[1 + params->offb]; PARAM = new T[5 + params->offc]; //params always has 5 elements back_X = new T[1 + params->offBX]; back_Y = new T[1 + params->offCY]; back_D1 = new T[1 + params->offa]; back_D2 = new T[1 + params->offb]; back_PARAM = new T[5 + params->offc]; //params always has 5 elements if((D1 == NULL) || (D2 == NULL) || (X == NULL) || (Y == NULL) || (PARAM == NULL) || (back_D1 == NULL) || (back_D2 == NULL) ||(back_X == NULL) || (back_Y == NULL) || (back_PARAM == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers<T>(D1, D2, X, Y, PARAM); deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; SUCCEED(); return; } srand(params->seed); //Filling random values for SA and SB. 
C & S are only for output sake randomRotmg( (D1 + params->offa), (D2 + params->offb), (X + params->offBX), (Y + params->offCY), (PARAM + params->offc) ); sflagParam = convertMultiplier<T>(params->alpha); PARAM[params->offc] = sflagParam; // initializing first element memcpy(back_X, X, (1 + params->offBX)*sizeof(T)); memcpy(back_Y, Y, (1 + params->offCY)*sizeof(T)); memcpy(back_D1, D1, (1 + params->offa)*sizeof(T)); memcpy(back_D2, D2, (1 + params->offb)*sizeof(T)); memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T)); // Allocate buffers bufD1 = base->createEnqueueBuffer(D1, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufD2 = base->createEnqueueBuffer(D2, (1 + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (1 + params->offBX) * sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (1 + params->offCY) * sizeof(T), 0, CL_MEM_READ_ONLY); bufParam = base->createEnqueueBuffer(PARAM, (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE); ::clMath::blas::rotmg(back_D1, params->offa, back_D2, params->offb, back_X, params->offBX, back_Y, params->offCY, back_PARAM, params->offc); // Hold X vector if ((bufD1 == NULL) || (bufD2 == NULL) || (bufX == NULL) || (bufY == NULL) || (bufParam == NULL)) { releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deleteBuffers<T>(D1, D2, X, Y, PARAM); deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } DataType type; type = ( typeid(T) == typeid(cl_float)) ? 
TYPE_FLOAT : TYPE_DOUBLE; err = (cl_int)::clMath::clblas::rotmg( type, bufD1, params->offa, bufD2, params->offb, bufX, params->offBX, bufY, params->offCY, bufParam, params->offc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deleteBuffers<T>(D1, D2, X, Y, PARAM); deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTMG() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deleteBuffers<T>(D1, D2, X, Y, PARAM); deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } err = clEnqueueReadBuffer(base->commandQueues()[0], bufD1, CL_TRUE, 0, (1 + params->offa) * sizeof(T), D1, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufD2, CL_TRUE, 0, (1 + params->offb) * sizeof(T), D2, 0, NULL, NULL); err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (1 + params->offBX) * sizeof(T), X, 0, NULL, NULL); err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (1 + params->offCY) * sizeof(T), Y, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufParam, CL_TRUE, 0, (5 + params->offc) * sizeof(T), PARAM, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "ROTMG: Reading results failed...." << std::endl; } releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deltaForType = DELTA_0<T>(); #ifndef CORR_TEST_WITH_ACML // Acml doesn't store answer in D1, D2 and X1. 
So skipping those checks cl_double delta; delta = deltaForType * returnMax<T>(back_D1[params->offa]); compareValues<T>( (back_D1 + params->offa), (D1 + params->offa), delta); delta = deltaForType * returnMax<T>(back_D2[params->offb]); compareValues<T>( (back_D2 + params->offb), (D2 + params->offb), delta); delta = deltaForType * returnMax<T>(back_X[params->offBX]); compareValues<T>( (back_X + params->offBX), (X + params->offBX), delta); delta = deltaForType * returnMax<T>(back_Y[params->offCY]); compareValues<T>( (back_Y + params->offCY), (Y + params->offCY), delta); #endif // Creating delta array for PARAM array cl_double deltaArr[5]; for(int i=0; i<5; i++) { deltaArr[i] = deltaForType * returnMax<T>(back_PARAM[i + (params->offc)]); } compareMatrices<T>(clblasColumnMajor, 5 , 1, (back_PARAM + params->offc), (PARAM + params->offc), 5, deltaArr); if (::testing::Test::HasFailure()) { printTestParams(params->offBX, params->offCY, params->offa, params->offb, params->offc, params->alpha); ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; } deleteBuffers<T>(D1, D2, X, Y, PARAM); deleteBuffers<T>(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; }
void copyCorrectnessTest(TestParams *params) { cl_int err; T *blasX, *blasY, *clblasY; cl_mem bufX, bufY; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); blasX = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; if((blasX == NULL) || (blasY == NULL) || (clblasY == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers<T>(blasX, blasY, clblasY); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; // Populate A and blasX randomVectors( params->N, (blasX+params->offBX), params->incx, (blasY+params->offCY), params->incy ); memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(blasY, (lengthY + params->offCY)* sizeof(*blasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xCOPY routine... 
"; ::clMath::blas::copy( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufX, bufY); deleteBuffers<T>(blasX, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xCOPY routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::copy(type, params->N, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers<T>(blasX, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::COPY() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers<T>(blasX, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, ((lengthY + params->offCY) * sizeof(*blasY)), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "COPY: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY); compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY, NULL); deleteBuffers<T>(blasX, blasY, clblasY); delete[] events; }
/*
 * Correctness test for the xGBMV routine (banded matrix-vector product).
 *
 * Random A, X and Y are generated by randomGbmvMatrices(). The host
 * reference clMath::blas::gbmv() updates blasY while clMath::clblas::gbmv()
 * updates a device copy that is read back into clblasY; the two Y vectors
 * are compared with compareMatrices().
 *
 * The reference is always invoked as column-major. For any other order the
 * transpose flag is toggled and M/N and KL/KU are swapped; when the
 * requested transpose is clblasConjTrans the host copy of A is additionally
 * conjugated (after it has been uploaded to the device).
 *
 * The test is reported through SUCCEED() and abandoned when a double
 * precision element type is used on a device without double support, or
 * when host arrays or device buffers come back as NULL.
 *
 * T is the element type; it is expected to come from the enclosing template
 * declaration, which is not part of this block.
 *
 * params - test configuration; this function reads order, transA, M, N, KL,
 *          KU, lda, incx, incy, offA, offBX, offCY, alpha, beta, seed and
 *          numCommandQueues.
 */
void gbmvCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *X, *blasY, *clblasY;
    cl_mem bufA, bufX, bufY;
    clMath::BlasBase *base;
    cl_event *events;
    T alpha, beta;
    size_t lengthX, lengthY, lengthA;

    base = clMath::BlasBase::getInstance();

    if (((typeid(T) == typeid(DoubleComplex)) ||
         (typeid(T) == typeid(cl_double))) &&
        !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    lengthA = ((params->order == clblasColumnMajor) ? params->N : params->M) *
              params->lda;

    // X and Y swap roles in terms of length when A is transposed.
    if (params->transA == clblasNoTrans) {
        lengthX = (params->N - 1) * abs(params->incx) + 1;
        lengthY = (params->M - 1) * abs(params->incy) + 1;
    } else {
        lengthX = (params->M - 1) * abs(params->incx) + 1;
        lengthY = (params->N - 1) * abs(params->incy) + 1;
    }

    A = new T[lengthA + params->offA];
    X = new T[lengthX + params->offBX];
    blasY = new T[lengthY + params->offCY];
    clblasY = new T[lengthY + params->offCY];

    srand(params->seed);

    if ((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) {
        deleteBuffers<T>(A, X, blasY, clblasY);
        ::std::cerr << "Cannot allocate memory on host side\n"
                    << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    alpha = convertMultiplier<T>(params->alpha);
    beta = convertMultiplier<T>(params->beta);

    randomGbmvMatrices(params->order, params->transA, params->M, params->N,
                       &alpha, &beta, (A + params->offA), params->lda,
                       (X + params->offBX), params->incx,
                       (blasY + params->offCY), params->incy);

    // Copy blasY to clblasY
    memcpy(clblasY, blasY, (lengthY + params->offCY) * sizeof(*blasY));

    // Allocate buffers
    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA) * sizeof(*A),
                                     0, CL_MEM_READ_ONLY);
    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X),
                                     0, CL_MEM_READ_ONLY);
    bufY = base->createEnqueueBuffer(clblasY,
               (lengthY + params->offCY) * sizeof(*clblasY), 0,
               CL_MEM_READ_WRITE);

    // Parameters actually handed to the column-major reference.
    clblasOrder fOrder;
    clblasTranspose fTrans;
    fOrder = params->order;
    fTrans = params->transA;
    size_t fM = params->M, fN = params->N, fKL = params->KL, fKU = params->KU;

    if (fOrder != clblasColumnMajor) {
        fOrder = clblasColumnMajor;
        fTrans = (params->transA == clblasNoTrans) ? clblasTrans
                                                   : clblasNoTrans;
        fM = params->N;
        fN = params->M;
        fKL = params->KU;
        fKU = params->KL;

        if (params->transA == clblasConjTrans) {
            // A's data starts at offset offA: that is the offset it was
            // allocated and populated with, and the one passed to the
            // reference below.
            doConjugate((A + params->offA), 1, lengthA, params->lda);
        }
    }

    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha,
                       A, params->offA, params->lda,
                       X, params->offBX, params->incx, beta,
                       blasY, params->offCY, params->incy);

    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
        // Skip the test, the most probable reason is
        // matrix too big for a device.
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)clMath::clblas::gbmv(params->order, params->transA,
              params->M, params->N, params->KL, params->KU, alpha,
              bufA, params->offA, params->lda,
              bufX, params->offBX, params->incx, beta,
              bufY, params->offCY, params->incy,
              params->numCommandQueues, base->commandQueues(),
              0, NULL, events);

    if (err != CL_SUCCESS) {
        // Resources are released first because the assertion below ends
        // the function.
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GBMV() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read of the device result.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
              (lengthY + params->offCY) * sizeof(*clblasY), clblasY,
              0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "GBMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufX, bufY);

    compareMatrices<T>(clblasColumnMajor, lengthY, 1,
                       (blasY + params->offCY), (clblasY + params->offCY),
                       lengthY);

    if (::testing::Test::HasFailure()) {
        printTestParams(params->order, params->transA, params->M, params->N,
                        params->KL, params->KU, params->alpha, params->offA,
                        params->lda, params->offBX, params->incx,
                        params->beta, params->offCY, params->incy);
        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, X, blasY, clblasY);
    delete[] events;
}
/*
 * Correctness test for the xGERC routine (conjugated rank-1 update).
 *
 * A, x and y are filled by populate(). The host reference updates A in
 * place, while ::clMath::clblas::gerc() updates a device copy that is read
 * back into backA; A and backA are then compared with compareMatrices().
 *
 * For column-major order the reference is clMath::blas::gerc(). For any
 * other order the host y is conjugated, x and y (with their increments and
 * offsets) and M and N are swapped, and the non-conjugating
 * clMath::blas::ger() is called as column-major instead.
 *
 * The test is reported through SUCCEED() and abandoned when a double
 * precision element type is used on a device without double support, or
 * when host arrays or device buffers come back as NULL.
 *
 * T is the element type; it is expected to come from the enclosing template
 * declaration, which is not part of this block.
 *
 * params - test configuration; this function reads order, M, N, lda, incx,
 *          incy, offa, offBX, offCY, alpha, seed and numCommandQueues.
 */
void gercCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *x, *y, *backA;
    // Value-initialised so that the calls below never read an indeterminate
    // value when base->useAlpha() is false.
    // NOTE(review): in that case alpha is zero and the update leaves A
    // unchanged - confirm whether params->alpha should always be used.
    T alpha_ = T();
    cl_mem bufA, bufx, bufy;
    clMath::BlasBase *base;
    cl_event *events;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    size_t lengthA;
    if (params->order == clblasColumnMajor) {
        lengthA = params->N * params->lda;
    } else {
        lengthA = params->M * params->lda;
    }

    size_t lengthx = (1 + (((params->M) - 1) * abs(params->incx)));
    size_t lengthy = (1 + (((params->N) - 1) * abs(params->incy)));

    bool useAlpha = base->useAlpha();
    if (useAlpha) {
        alpha_ = convertMultiplier<T>(params->alpha);
    }

    A = new T[lengthA + params->offa];
    x = new T[lengthx + params->offBX];
    y = new T[lengthy + params->offCY];
    backA = new T[lengthA + params->offa];

    if ((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL)) {
        ::std::cerr << "Cannot allocate memory on host side\n"
                    << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    int creationFlags = 0;
    creationFlags = creationFlags | RANDOM_INIT;
    if (params->order == clblasRowMajor) {
        creationFlags = creationFlags | ROW_MAJOR_ORDER;
    }

    BlasRoutineID BlasFn = CLBLAS_GER;
    populate((A + params->offa), params->M, params->N, params->lda,
             BlasFn, creationFlags);
    populate((x + params->offBX), lengthx, 1, lengthx, BlasFn);
    populate((y + params->offCY), lengthy, 1, lengthy, BlasFn);

    // Copy A to backA
    memcpy(backA, A, (lengthA + params->offa) * sizeof(T));

    // Allocate buffers
    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A),
                                     0, CL_MEM_READ_WRITE);
    bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x),
                                     0, CL_MEM_READ_ONLY);
    bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y),
                                     0, CL_MEM_READ_ONLY);

    // Parameters actually handed to the column-major reference.
    clblasOrder fOrder;
    size_t fN, fM;
    size_t fOffx, fOffy;
    int fIncx, fIncy;
    T *fX, *fY;

    fOrder = params->order;
    fM = params->M;
    fN = params->N;
    fIncx = params->incx;
    fIncy = params->incy;
    fX = x;
    fY = y;
    fOffx = params->offBX;
    fOffy = params->offCY;

    if (fOrder != clblasColumnMajor) {
        // The device buffers were created above, so conjugating the host y
        // here only affects the reference computation.
        doConjugate((y + params->offCY), lengthy, 1, 1);

        fOrder = clblasColumnMajor;
        fM = params->N;
        fN = params->M;
        fX = y;
        fY = x;
        fIncx = params->incy;
        fIncy = params->incx;
        fOffx = params->offCY;
        fOffy = params->offBX;

        // Note this according to the Legacy guide
        clMath::blas::ger(fOrder, fM, fN, alpha_, fX, fOffx, fIncx,
                          fY, fOffy, fIncy, A, params->offa, params->lda);
    } else {
        clMath::blas::gerc(fOrder, fM, fN, alpha_, fX, fOffx, fIncx,
                           fY, fOffy, fIncy, A, params->offa, params->lda);
    }

    if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::gerc(params->order, params->M, params->N,
              alpha_, bufx, params->offBX, params->incx,
              bufy, params->offCY, params->incy,
              bufA, params->offa, params->lda,
              params->numCommandQueues, base->commandQueues(),
              0, NULL, events);

    if (err != CL_SUCCESS) {
        // Resources are released first because the assertion below ends
        // the function.
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GERC() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read of the device result; a failure is reported rather than
    // silently ignored.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
              (lengthA + params->offa) * sizeof(*backA), backA,
              0, NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "GERC: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufx, bufy);

    // handle lda correctly based on row-major/col-major..
    compareMatrices<T>(params->order, params->M, params->N,
                       A + params->offa, backA + params->offa, params->lda);

    if (::testing::Test::HasFailure()) {
        printTestParams(params->order, params->M, params->N, useAlpha,
                        base->alpha(), params->lda, params->incx,
                        params->incy, params->offa, params->offBX,
                        params->offCY);
        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, x, y, backA);
    delete[] events;
}