/**
 * Times one run of the reference (etalon) HPMV routine.
 *
 * The reference call is compiled in only when PERF_TEST_WITH_ACML is defined;
 * without it nothing is measured and 0 is returned. Unless
 * PERF_TEST_WITH_ROW_MAJOR is defined, a row-major problem is rejected.
 *
 * A row-major problem is mapped onto a column-major call by flipping the
 * triangle selector and conjugating the packed matrix in place.
 *
 * @return Difference of two getCurrentTime() readings taken around the
 *         reference call, 0 when no reference call is compiled in, or
 *         NANOTIME_ERR when row-major order is requested but not allowed.
 */
template <typename ElemType> nano_time_t
HpmvPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;
    clblasOrder order;
    clblasUplo fUplo;

#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    order = params_.order;
    fUplo = params_.uplo;

#ifdef PERF_TEST_WITH_ACML
    if (order != clblasColumnMajor) {
        order = clblasColumnMajor;
        fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper;

        // AP_ is a packed triangle of N*(N+1)/2 elements and is conjugated
        // exactly once over that range. An extra N x N / lda pass would run
        // past the packed data and undo part of this conjugation.
        doConjugate( (AP_ + params_.offA), ((params_.N * (params_.N + 1)) / 2 ), 1, 1 );
    }

    time = getCurrentTime();
    clMath::blas::hpmv(order, fUplo, params_.N, alpha, AP_, params_.offA,
                       X_, params_.offBX, params_.incx,
                       beta, Y_, params_.offCY, params_.incy);
    time = getCurrentTime() - time;
#endif  // PERF_TEST_WITH_ACML

    return time;
}
/**
 * Measures one run of the reference (etalon) HER2 routine.
 *
 * The measurement exists only when PERF_TEST_WITH_ACML is defined; otherwise
 * the function returns 0. Row-major input is refused unless
 * PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Elapsed getCurrentTime() delta around the reference call, 0 when
 *         no reference call is compiled in, or NANOTIME_ERR for a refused
 *         row-major request.
 */
template <typename ElemType> nano_time_t
Her2PerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    nano_time_t elapsed = 0;

#ifdef PERF_TEST_WITH_ACML
    clblasOrder refOrder = params_.order;
    clblasUplo refUplo = params_.uplo;
    size_t lda = params_.lda;

    // Vector operands in the order handed to the reference routine.
    ElemType *firstVec = X_;
    size_t firstOff = params_.offBX;
    int firstInc = params_.incx;
    ElemType *secondVec = Y_;
    size_t secondOff = params_.offCY;
    int secondInc = params_.incy;

    if (refOrder != clblasColumnMajor) {
        // Row major is expressed as a column-major call: both vectors are
        // conjugated in place, the triangle is flipped and X/Y trade places.
        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
        doConjugate( (Y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );

        refOrder = clblasColumnMajor;
        refUplo = (refUplo == clblasLower) ? clblasUpper : clblasLower;

        firstVec = Y_;
        firstOff = params_.offCY;
        firstInc = params_.incy;
        secondVec = X_;
        secondOff = params_.offBX;
        secondInc = params_.incx;
    }

    const nano_time_t started = getCurrentTime();
    clMath::blas::her2(refOrder, refUplo, params_.N, alpha_,
                       firstVec, firstOff, firstInc,
                       secondVec, secondOff, secondInc,
                       A_, params_.offa, lda);
    elapsed = getCurrentTime() - started;
#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
/**
 * Measures one run of the reference (etalon) TRSV routine.
 *
 * X_ is refreshed from backX_ before anything else. The reference call itself
 * is compiled in only with PERF_TEST_WITH_ACML; otherwise 0 is returned.
 * Row-major input is refused unless PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Elapsed getCurrentTime() delta around the reference call, 0 when
 *         no reference call is compiled in, or NANOTIME_ERR for a refused
 *         row-major request.
 */
template <typename ElemType> nano_time_t
TrsvPerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    // Copy the backup over X_ (offset prefix included).
    // NOTE(review): presumably because trsv overwrites X in place - confirm.
    const size_t xElems = (1 + ((params_.N-1) * abs(params_.incx))) + params_.offBX;
    memcpy(X_, backX_, xElems * sizeof(ElemType));

    nano_time_t elapsed = 0;

#ifdef PERF_TEST_WITH_ACML
    clblasOrder refOrder = params_.order;
    clblasUplo refUplo = params_.uplo;
    clblasTranspose refTrans = params_.transA;
    size_t lda = params_.lda;

    if (refOrder != clblasColumnMajor) {
        // Row major -> column major: flip the triangle and toggle the
        // transposition; a conjugate-transpose request additionally
        // conjugates A in place.
        refOrder = clblasColumnMajor;
        refUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
        refTrans = (params_.transA == clblasNoTrans) ? clblasTrans : clblasNoTrans;

        if (params_.transA == clblasConjTrans) {
            doConjugate( A_ + params_.offa, params_.N, params_.N, lda );
        }
    }

    const nano_time_t started = getCurrentTime();
    clMath::blas::trsv(refOrder, refUplo, refTrans, params_.diag, params_.N,
                       A_, params_.offa, lda,
                       X_, params_.offBX, params_.incx);
    elapsed = getCurrentTime() - started;
#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
/**
 * Times one run of the reference (etalon) GBMV routine.
 *
 * Y_ is refreshed from backY_ and, for a non-column-major problem, the
 * arguments are remapped to a column-major call (dimensions and band widths
 * swapped, transposition toggled, A conjugated in place for a
 * conjugate-transpose request). This preparation happens unconditionally; the
 * timed reference call is compiled in only with PERF_TEST_WITH_ACML.
 *
 * @return Difference of two getCurrentTime() readings around the reference
 *         call, or 0 when no reference call is compiled in.
 */
template <typename ElemType> nano_time_t
GbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;
    clblasOrder fOrder;
    clblasTranspose fTrans;
    size_t lda, lenY, lenA;
    size_t fM = params_.M, fN = params_.N, fKL = params_.KL, fKU = params_.KU;

    lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * params_.lda;

    // Use |incy|: with a negative increment the unsigned length would wrap
    // and memcpy would be asked to copy a bogus, huge range.
    lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1) * abs(params_.incy)
           + 1 + params_.offCY;
    memcpy(Y_, backY_, lenY * sizeof(ElemType));

    fOrder = params_.order;
    fTrans = params_.transA;
    lda = params_.lda;

    if (fOrder != clblasColumnMajor) {
        fOrder = clblasColumnMajor;
        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
        fM = params_.N;
        fN = params_.M;
        fKL = params_.KU;
        fKU = params_.KL;

        // Conjugate the same region the reference call reads (offset offA).
        if( params_.transA == clblasConjTrans )
            doConjugate( (A_ + params_.offA), 1, lenA, lda );
    }

#ifdef PERF_TEST_WITH_ACML
    time = getCurrentTime();
    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A_, params_.offA, lda,
                       X_, params_.offBX, params_.incx,
                       beta, Y_, params_.offCY, params_.incy);
    time = getCurrentTime() - time;
#endif  // PERF_TEST_WITH_ACML

    return time;
}
/**
 * Measures one run of the reference (etalon) HPR routine.
 *
 * The measurement exists only when PERF_TEST_WITH_ACML is defined; otherwise
 * the function returns 0. Row-major input is refused unless
 * PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Elapsed getCurrentTime() delta around the reference call, 0 when
 *         no reference call is compiled in, or NANOTIME_ERR for a refused
 *         row-major request.
 */
template <typename ElemType> nano_time_t
HprPerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    nano_time_t elapsed = 0;

#ifdef PERF_TEST_WITH_ACML
    clblasOrder refOrder = params_.order;
    clblasUplo refUplo = params_.uplo;

    if (params_.order != clblasColumnMajor) {
        // Row major is expressed as a column-major call on the conjugated
        // vector with the opposite triangle.
        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
        refOrder = clblasColumnMajor;
        refUplo = (refUplo == clblasLower) ? clblasUpper : clblasLower;
    }

    const nano_time_t started = getCurrentTime();
    clMath::blas::hpr(refOrder, refUplo, params_.N, CREAL(alpha_),
                      X_, params_.offBX, params_.incx,
                      AP_, params_.offa);
    elapsed = getCurrentTime() - started;
#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
/**
 * Correctness test for GBMV: runs the reference implementation on host data
 * and the clblas implementation on device buffers built from the same data,
 * then compares the resulting Y vectors.
 *
 * The test is skipped (reported as success) when the device lacks double
 * precision support for a double-based T, when host allocation fails, or
 * when a device buffer cannot be created.
 *
 * T is the element type; it is expected to be supplied by the enclosing
 * template declaration, which is outside this excerpt.
 *
 * @param params Problem description (order, transposition, sizes, band
 *               widths, offsets, increments, multipliers, seed, queues).
 */
void gbmvCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *X, *blasY, *clblasY;
    cl_mem bufA, bufX, bufY;
    clMath::BlasBase *base;
    cl_event *events;
    T alpha, beta;
    size_t lengthX, lengthY, lengthA;

    base = clMath::BlasBase::getInstance();

    if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) &&
        !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    lengthA = ((params->order == clblasColumnMajor)? params->N : params->M) * params->lda;

    // X and Y swap roles in length when A is (conjugate-)transposed.
    if (params->transA == clblasNoTrans) {
        lengthX = (params->N - 1)*abs(params->incx) + 1;
        lengthY = (params->M - 1)*abs(params->incy) + 1;
    }
    else {
        lengthX = (params->M - 1)*abs(params->incx) + 1;
        lengthY = (params->N - 1)*abs(params->incy) + 1;
    }

    A       = new T[lengthA + params->offA ];
    X       = new T[lengthX + params->offBX ];
    blasY   = new T[lengthY + params->offCY ];
    clblasY = new T[lengthY + params->offCY ];

    srand(params->seed);

    if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) {
        deleteBuffers<T>(A, X, blasY, clblasY);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    alpha = convertMultiplier<T>(params->alpha);
    beta  = convertMultiplier<T>(params->beta);

    randomGbmvMatrices(params->order, params->transA, params->M, params->N, &alpha, &beta,
                       (A + params->offA), params->lda, (X+params->offBX), params->incx,
                       (blasY+params->offCY), params->incy );

    // Copy blasY to clblasY so both implementations start from the same Y
    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));

    // Allocate device buffers (done before A may be conjugated on the host below)
    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY);
    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
    bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);

    clblasOrder fOrder;
    clblasTranspose fTrans;
    fOrder = params->order;
    fTrans = params->transA;
    size_t fM = params->M, fN = params->N, fKL = params->KL, fKU = params->KU;

    // The reference routine is always called column-major; a row-major
    // problem is remapped by swapping dimensions/band widths and toggling
    // the transposition.
    if (fOrder != clblasColumnMajor) {
        fOrder = clblasColumnMajor;
        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
        fM = params->N;
        fN = params->M;
        fKL = params->KU;
        fKU = params->KL;

        // Conjugate the region that was filled and is read by the reference
        // call, i.e. the one starting at offA.
        if( params->transA == clblasConjTrans )
            doConjugate( (A + params->offA), 1, lengthA, params->lda );
    }

    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A, params->offA, params->lda,
                       X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy);

    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
        // Skip the test, the most probable reason is
        // matrix too big for a device.
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)clMath::clblas::gbmv(params->order, params->transA, params->M, params->N,
                                       params->KL, params->KU, alpha, bufA, params->offA, params->lda,
                                       bufX, params->offBX, params->incx, beta,
                                       bufY, params->offCY, params->incy,
                                       params->numCommandQueues, base->commandQueues(),
                                       0, NULL, events);

    if (err != CL_SUCCESS) {
        // Resources are freed first because the failed assertion returns.
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GBMV() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
                              (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
                              NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "GBMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufX, bufY);

    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY),
                       (clblasY + params->offCY), lengthY);

    if (::testing::Test::HasFailure()) {
        printTestParams(params->order, params->transA, params->M, params->N, params->KL, params->KU,
                        params->alpha, params->offA, params->lda, params->offBX, params->incx,
                        params->beta, params->offCY, params->incy);
        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, X, blasY, clblasY);
    delete[] events;
}
/**
 * Correctness test for HPMV: the reference routine runs on host copies of
 * the data, the clblas routine runs on device buffers created from the same
 * data, and the two resulting Y vectors are compared.
 *
 * The test is skipped (reported as success) when T is DoubleComplex and the
 * device lacks double precision support, when host allocation fails, or when
 * a device buffer cannot be created.
 *
 * T is the element type; it is expected to be supplied by the enclosing
 * template declaration, which is outside this excerpt.
 *
 * @param params Problem description (order, triangle, size, offsets,
 *               increments, multipliers, seed, command queues).
 */
void hpmvCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(DoubleComplex)) &&
        !blasBase->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *queueEvents = new cl_event[params->numCommandQueues];
    memset(queueEvents, 0, params->numCommandQueues * sizeof(cl_event));

    // Element counts without the leading offsets ...
    size_t packedLen = (params->N * (params->N + 1)) / 2;
    size_t xLen = (1 + ((params->N -1) * abs(params->incx)));
    size_t yLen = (1 + ((params->N -1) * abs(params->incy)));

    // ... and with them, which is what gets allocated and transferred.
    size_t packedTotal = packedLen + params->offA;
    size_t xTotal = xLen + params->offBX;
    size_t yTotal = yLen + params->offCY;

    T *packedA = new T[packedTotal];
    T *vecX = new T[xTotal];
    T *refY = new T[yTotal];
    T *devY = new T[yTotal];

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    if (!(packedA != NULL && vecX != NULL && refY != NULL && devY != NULL)) {
        deleteBuffers<T>(packedA, vecX, refY, devY);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] queueEvents;
        SUCCEED();
        return;
    }

    T alpha = convertMultiplier<T>(params->alpha);
    T beta = convertMultiplier<T>(params->beta);

    randomHemvMatrices(params->order, params->uplo, params->N, true, &alpha,
                       (packedA + params->offA), params->lda,
                       (vecX + params->offBX), params->incx,
                       true, &beta, (refY + params->offCY), params->incy);

    // Both implementations must start from the same Y.
    memcpy(devY, refY, yTotal * sizeof(*refY));
    ::std::cerr << "Done" << ::std::endl;

    // Device buffers are created before the host matrix may be conjugated.
    cl_mem bufAP = blasBase->createEnqueueBuffer(packedA, packedTotal * sizeof(*packedA), 0, CL_MEM_READ_ONLY);
    cl_mem bufX = blasBase->createEnqueueBuffer(vecX, xTotal * sizeof(*vecX), 0, CL_MEM_READ_ONLY);
    cl_mem bufY = blasBase->createEnqueueBuffer(devY, yTotal * sizeof(*devY), 0, CL_MEM_READ_WRITE);

    ::std::cerr << "Calling reference xHPMV routine... ";

    clblasOrder refOrder = params->order;
    clblasUplo refUplo = params->uplo;

    if (params->order != clblasColumnMajor) {
        // Row major is handled as a column-major call on the conjugated
        // packed matrix with the opposite triangle.
        refOrder = clblasColumnMajor;
        refUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
        doConjugate( (packedA + params->offA), packedLen, 1, 1 );
    }

    ::clMath::blas::hpmv( refOrder, refUplo, params->N, alpha, packedA, params->offA,
                          vecX, params->offBX, params->incx,
                          beta, refY, params->offCY, params->incy);
    ::std::cerr << "Done" << ::std::endl;

    if (!(bufAP != NULL && bufX != NULL && bufY != NULL)) {
        // Skip the test, the most probable reason is
        // matrix too big for a device.
        releaseMemObjects(bufAP, bufX, bufY);
        deleteBuffers<T>(packedA, vecX, refY, devY);
        delete[] queueEvents;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xHPMV routine... ";

    cl_int status = (cl_int)::clMath::clblas::hpmv(params->order, params->uplo, params->N, alpha,
                                                   bufAP, params->offA,
                                                   bufX, params->offBX, params->incx,
                                                   beta, bufY, params->offCY, params->incy,
                                                   params->numCommandQueues, blasBase->commandQueues(),
                                                   0, NULL, queueEvents);

    if (status != CL_SUCCESS) {
        // Clean up first: the failing assertion leaves the function.
        releaseMemObjects(bufAP, bufX, bufY);
        deleteBuffers<T>(packedA, vecX, refY, devY);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "::clMath::clblas::HPMV() failed";
    }

    status = waitForSuccessfulFinish(params->numCommandQueues,
                                     blasBase->commandQueues(), queueEvents);
    if (status != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX, bufY);
        deleteBuffers<T>(packedA, vecX, refY, devY);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    status = clEnqueueReadBuffer(blasBase->commandQueues()[0], bufY, CL_TRUE, 0,
                                 yTotal * sizeof(*devY), devY, 0,
                                 NULL, NULL);
    if (status != CL_SUCCESS) {
        ::std::cerr << "HPMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX, bufY);

    compareMatrices<T>(clblasColumnMajor, yLen , 1, (refY + params->offCY),
                       (devY + params->offCY), yLen);

    deleteBuffers<T>(packedA, vecX, refY, devY);
    delete[] queueEvents;
}
/**
 * Correctness test for SPR: applies the reference routine to a host copy of
 * the packed matrix and the clblas routine to a device copy of the same
 * data, then compares the two packed results.
 *
 * The test is skipped (reported as success) when T is cl_double and the
 * device lacks double precision support, when host allocation fails, or when
 * a device buffer cannot be created.
 *
 * T is the element type; it is expected to be supplied by the enclosing
 * template declaration, which is outside this excerpt.
 *
 * @param params Problem description (order, triangle, size, offsets,
 *               increment, alpha, seed, command queues).
 */
void sprCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *blasAP, *clblasAP, *X;
    cl_mem bufAP, bufX;
    clMath::BlasBase *base;
    cl_event *events;
    bool useAlpha;
    T alpha;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double)) &&
        !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    size_t lengthAP = ( ( params->N*( params->N + 1 ) )/2 );
    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));

    blasAP   = new T[lengthAP + params->offa];
    clblasAP = new T[lengthAP + params->offa];
    X        = new T[lengthX + params->offBX];

    srand(params->seed);

    // Validate the allocations before any of the buffers is touched.
    if((blasAP == NULL) || (X == NULL) || (clblasAP == NULL)) {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        SUCCEED();
        return;
    }

    // Pre-fill every byte of the host buffers; memset counts bytes, so the
    // element counts are scaled by the element size.
    memset(blasAP, -1, (lengthAP + params->offa) * sizeof(*blasAP));
    memset(clblasAP, -1, (lengthAP + params->offa) * sizeof(*clblasAP));
    memset(X, -1, (lengthX + params->offBX) * sizeof(*X));

    alpha = convertMultiplier<T>(params->alpha);
    useAlpha = true;

#ifdef DEBUG_SPR
    printf("ALPHA in CORR_SPR.CPP %f\n", alpha);
#endif

    randomSyrMatrices<T>(params->order, params->uplo, params->N, useAlpha, &alpha,
                         (blasAP + params->offa), 0, (X + params->offBX), params->incx);

    // Both implementations must start from the same packed matrix.
    memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP));

    bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa) * sizeof(*clblasAP), 0, CL_MEM_READ_WRITE);
    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);

    clblasOrder order;
    clblasUplo fUplo;

    order = params->order;
    fUplo = params->uplo;

    // The reference routine is always called column-major; a row-major
    // problem uses the opposite triangle instead.
    if (order != clblasColumnMajor) {
        order = clblasColumnMajor;
        fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper;

        // NOTE(review): conjugation in a symmetric (SPR) test is presumably
        // a no-op for real T - confirm against doConjugate.
        if( params->transA == clblasConjTrans )
            doConjugate( (blasAP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );
    }

    clMath::blas::spr( order, fUplo, params->N, alpha, X, params->offBX, params->incx, blasAP, params->offa);

    if ((bufAP == NULL) || (bufX == NULL) ) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::spr( params->order, params->uplo, params->N, alpha,
                                         bufX, params->offBX, params->incx, bufAP, params->offa,
                                         params->numCommandQueues, base->commandQueues(),
                                         0, NULL, events);

    if (err != CL_SUCCESS) {
        // Resources are freed first because the failed assertion returns.
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPR() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
                              (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0,
                              NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "SPR: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX);

    compareMatrices<T>(clblasColumnMajor, lengthAP , 1, (blasAP + params->offa), (clblasAP + params->offa),
                       lengthAP);

    deleteBuffers<T>(blasAP, clblasAP, X);
    delete[] events;
}
/**
 * Measures one run of the reference (etalon) GERC routine.
 *
 * The measurement exists only when PERF_TEST_WITH_ACML is defined; otherwise
 * the function returns 0. Row-major input is refused unless
 * PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Elapsed getCurrentTime() delta around the reference call, 0 when
 *         no reference call is compiled in, or NANOTIME_ERR for a refused
 *         row-major request.
 */
template <typename ElemType> nano_time_t
GercPerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    nano_time_t elapsed = 0;

#ifdef PERF_TEST_WITH_ACML
    clblasOrder inputOrder = params_.order;
    size_t lda = params_.lda;

    // Arguments as handed to the reference routine; remapped for row major.
    clblasOrder refOrder = params_.order;
    size_t rows = params_.M;
    size_t cols = params_.N;
    ElemType *firstVec = x_;
    ElemType *secondVec = y_;
    size_t firstOff = params_.offBX;
    size_t secondOff = params_.offCY;
    int firstInc = params_.incx;
    int secondInc = params_.incy;

    if (refOrder == clblasColumnMajor) {
        elapsed = getCurrentTime();
        clMath::blas::gerc(inputOrder, rows, cols, alpha_,
                           firstVec, firstOff, params_.incx,
                           secondVec, secondOff, params_.incy,
                           A_, params_.offa, lda);
    }
    else {
        // Row major: conjugate y in place, swap the dimensions and the two
        // vectors, and use the non-conjugating GER on the column-major view.
        refOrder = clblasColumnMajor;
        doConjugate( (y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );

        rows = params_.N;
        cols = params_.M;
        firstVec = y_;
        secondVec = x_;
        firstInc = params_.incy;
        secondInc = params_.incx;
        firstOff = params_.offCY;
        secondOff = params_.offBX;

        // Note this according to the Legacy guide
        elapsed = getCurrentTime();
        clMath::blas::ger(refOrder, rows, cols, alpha_,
                          firstVec, firstOff, firstInc,
                          secondVec, secondOff, secondInc,
                          A_, params_.offa, params_.lda);
    }

    elapsed = getCurrentTime() - elapsed;
#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
/**
 * Correctness test for TPMV: the reference routine runs on a host copy of X,
 * the clblas routine runs on device buffers built from the same data (plus a
 * scratch buffer), and the two resulting X vectors are compared.
 *
 * The test is skipped (reported as success) when T is double-based and the
 * device lacks double precision support, when host allocation fails, or when
 * a device buffer cannot be created.
 *
 * T is the element type; it is expected to be supplied by the enclosing
 * template declaration, which is outside this excerpt.
 *
 * @param params Problem description (order, triangle, transposition, diag,
 *               size, offsets, increment, seed, command queues).
 */
void tpmvCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !blasBase->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *queueEvents = new cl_event[params->numCommandQueues];
    memset(queueEvents, 0, params->numCommandQueues * sizeof(cl_event));

    // Element counts without the leading offsets ...
    size_t packedLen = (params->N *( params->N + 1 ))/2 ;
    size_t xLen = (1 + ((params->N -1) * abs(params->incx)));

    // ... and with them, which is what gets allocated and transferred.
    size_t packedTotal = packedLen + params->offa;
    size_t xTotal = xLen + params->offBX;

    T *packedA = new T[packedTotal];
    T *refX = new T[xTotal];
    T *devX = new T[xTotal];

    if (!(packedA != NULL && refX != NULL && devX != NULL)) {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        SUCCEED();
        return;
    }

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    // Flags for populate(): random packed data, row-major only when
    // requested, restricted to the selected triangle.
    int fillFlags = 0;
    fillFlags = fillFlags | RANDOM_INIT | PACKED_MATRIX;
    if (params->order == clblasRowMajor) {
        fillFlags = fillFlags | ROW_MAJOR_ORDER;
    }
    if (params->uplo == clblasLower) {
        fillFlags = fillFlags | LOWER_HALF_ONLY;
    }
    else {
        fillFlags = fillFlags | UPPER_HALF_ONLY;
    }

    BlasRoutineID routineId = CLBLAS_TRMV;

    // Fill the packed matrix and the whole X buffer (offset prefix included).
    populate( packedA + params->offa, params->N, params->N, 0, routineId, fillFlags);
    populate( refX, xTotal, 1, xTotal, routineId);

    // Both implementations must start from the same X.
    memcpy(devX, refX, xTotal * sizeof(*refX));
    ::std::cerr << "Done" << ::std::endl;

    // Device buffers are created before the host matrix may be conjugated.
    cl_mem bufAP = blasBase->createEnqueueBuffer(packedA, packedTotal * sizeof(*packedA), 0, CL_MEM_READ_ONLY);
    cl_mem bufX = blasBase->createEnqueueBuffer(devX, xTotal * sizeof(*devX), 0, CL_MEM_WRITE_ONLY);
    cl_mem bufXTemp = blasBase->createEnqueueBuffer(NULL, xLen * sizeof(*devX), 0, CL_MEM_READ_ONLY);

    ::std::cerr << "Calling reference xTPMV routine... ";

    clblasOrder refOrder = params->order;
    clblasUplo refUplo = params->uplo;
    clblasTranspose refTrans = params->transA;

    if (params->order != clblasColumnMajor) {
        // Row major -> column major: opposite triangle, toggled
        // transposition, and an in-place conjugation of the packed matrix
        // for a conjugate-transpose request.
        refOrder = clblasColumnMajor;
        refUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;
        refTrans = (params->transA == clblasNoTrans) ? clblasTrans : clblasNoTrans;

        if (params->transA == clblasConjTrans) {
            doConjugate( (packedA + params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );
        }
    }

    ::clMath::blas::tpmv( refOrder, refUplo, refTrans, params->diag, params->N,
                          packedA, params->offa, refX, params->offBX, params->incx);
    ::std::cerr << "Done" << ::std::endl;

    if (!(bufAP != NULL && bufX != NULL && bufXTemp != NULL)) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufAP, bufX, bufXTemp);
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xTPMV routine... ";

    // Map T onto the runtime type tag expected by the clblas wrapper.
    DataType elemKind;
    if (typeid(T) == typeid(cl_float)) {
        elemKind = TYPE_FLOAT;
    }
    else if (typeid(T) == typeid(cl_double)) {
        elemKind = TYPE_DOUBLE;
    }
    else if (typeid(T) == typeid(cl_float2)) {
        elemKind = TYPE_COMPLEX_FLOAT;
    }
    else {
        elemKind = TYPE_COMPLEX_DOUBLE;
    }

    // The scratch buffer is passed alongside X.
    cl_int status = (cl_int)::clMath::clblas::tpmv( elemKind, params->order, params->uplo, params->transA,
                                                    params->diag, params->N,
                                                    bufAP, params->offa,
                                                    bufX, params->offBX, params->incx, bufXTemp,
                                                    params->numCommandQueues, blasBase->commandQueues(),
                                                    0, NULL, queueEvents);

    if (status != CL_SUCCESS) {
        // Clean up first: the failing assertion leaves the function.
        releaseMemObjects(bufAP, bufX, bufXTemp);
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "::clMath::clblas::TPMV() failed";
    }

    status = waitForSuccessfulFinish(params->numCommandQueues,
                                     blasBase->commandQueues(), queueEvents);
    if (status != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX, bufXTemp);
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    status = clEnqueueReadBuffer(blasBase->commandQueues()[0], bufX, CL_TRUE, 0,
                                 xTotal * sizeof(*devX), devX, 0,
                                 NULL, NULL);
    if (status != CL_SUCCESS) {
        ::std::cerr << "TPMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX, bufXTemp);

    compareMatrices<T>(clblasColumnMajor, xLen , 1, (refX + params->offBX), (devX + params->offBX),
                       xLen);

    deleteBuffers<T>(packedA, refX, devX);
    delete[] queueEvents;
}
/**
 * Correctness test for GERC: applies the reference routine to the host
 * matrix A and the clblas routine to a device copy of the same data, reads
 * the device result into backA and compares the two matrices.
 *
 * The test is skipped (reported as success) when T is double-based and the
 * device lacks double precision support, when host allocation fails, or when
 * a device buffer cannot be created.
 *
 * T is the element type; it is expected to be supplied by the enclosing
 * template declaration, which is outside this excerpt.
 *
 * @param params Problem description (order, sizes, lda, offsets, increments,
 *               alpha, seed, command queues).
 */
void gercCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *x, *y, *backA;
    // Value-initialized so that a defined multiplier (zero) reaches both
    // implementations when the base reports that alpha is not in use.
    T alpha_ = T();
    cl_mem bufA, bufx, bufy;
    clMath::BlasBase *base;
    cl_event *events;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    size_t lengthA;
    if( params->order == clblasColumnMajor )
        lengthA = params->N * params->lda;
    else
        lengthA = params->M * params->lda;

    size_t lengthx = (1 + (((params->M)-1) * abs(params->incx)));
    size_t lengthy = (1 + (((params->N)-1) * abs(params->incy)));

    bool useAlpha = base->useAlpha();

    if (useAlpha) {
        alpha_ = convertMultiplier<T>(params->alpha);
    }

    A     = new T[lengthA + params->offa];
    x     = new T[lengthx + params->offBX];
    y     = new T[lengthy + params->offCY];
    backA = new T[lengthA + params->offa];

    if((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL)) {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    int creationFlags = 0;
    creationFlags = creationFlags | RANDOM_INIT;
    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
    BlasRoutineID BlasFn = CLBLAS_GER;

    populate( (A + params->offa), params->M, params->N, params-> lda, BlasFn, creationFlags);
    populate( (x + params->offBX), lengthx, 1, lengthx, BlasFn );
    populate( (y + params->offCY), lengthy, 1, lengthy, BlasFn );

    // Copy A to backA
    memcpy(backA, A, (lengthA + params->offa) * sizeof(T));

    // Allocate device buffers (done before y may be conjugated on the host below)
    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE);
    bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x), 0, CL_MEM_READ_ONLY);
    bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY);

    clblasOrder fOrder;
    size_t fN, fM;
    size_t fOffx, fOffy;
    int fIncx, fIncy;
    T *fX, *fY;

    fOrder = params->order;
    fM = params->M;
    fN = params->N;
    fIncx = params->incx;
    fIncy = params->incy;
    fX = x;
    fY = y;
    fOffx = params->offBX;
    fOffy = params->offCY;

    if (fOrder != clblasColumnMajor) {
        // Row major: conjugate y in place, swap the dimensions and the two
        // vectors, and use the non-conjugating GER on the column-major view.
        doConjugate( (y + params->offCY), (1 + (params->N-1) * abs(params->incy)), 1, 1 );
        fOrder = clblasColumnMajor;
        fM = params->N;
        fN = params->M;
        fX = y;
        fY = x;
        fIncx = params->incy;
        fIncy = params->incx;
        fOffx = params->offCY;
        fOffy = params->offBX;

        // Note this according to the Legacy guide
        clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda);
    }
    else {
        clMath::blas::gerc(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda);
    }

    if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::gerc( params->order, params->M, params->N, alpha_,
                                          bufx, params->offBX, params->incx,
                                          bufy, params->offCY, params->incy,
                                          bufA, params->offa, params->lda,
                                          params->numCommandQueues, base->commandQueues(),
                                          0, NULL, events );

    if (err != CL_SUCCESS) {
        // Resources are freed first because the failed assertion returns.
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GERC() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Report a failed read-back instead of silently comparing stale data.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
                              (lengthA + params->offa)* sizeof(*backA), backA, 0,
                              NULL, NULL);
    if (err != CL_SUCCESS) {
        ::std::cerr << "GERC: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufx, bufy);

    // handle lda correctly based on row-major/col-major..
    compareMatrices<T>(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda);

    if (::testing::Test::HasFailure()) {
        printTestParams(params->order, params->M, params->N, useAlpha, base->alpha(), params->lda,
                        params->incx, params->incy, params->offa, params->offBX, params->offCY);
        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, x, y, backA);
    delete[] events;
}