コード例 #1
0
ファイル: perf-hpmv.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Measures one run of the reference (host-side) HPMV implementation.
 *
 * The reference call is compiled in only when PERF_TEST_WITH_ACML is
 * defined; without it the function simply returns 0.  Unless
 * PERF_TEST_WITH_ROW_MAJOR is defined, row-major problems are refused.
 *
 * @return Difference of the two getCurrentTime() readings taken around the
 *         reference call, 0 if no reference call was compiled in, or
 *         NANOTIME_ERR when a row-major problem is refused.
 */
template <typename ElemType> nano_time_t
HpmvPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;
    clblasOrder order;
    clblasUplo fUplo;

#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif
    order = params_.order;
    fUplo = params_.uplo;

#ifdef PERF_TEST_WITH_ACML

    if (order != clblasColumnMajor)
    {
        // The reference routine is always driven in column-major order: a
        // row-major problem is expressed by selecting the opposite triangle
        // and conjugating the matrix data.
        order = clblasColumnMajor;
        fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper;

        // AP_ is treated as packed storage of N*(N+1)/2 elements, so it is
        // conjugated exactly once over that range.  An additional N x N pass
        // strided by lda would address the packed buffer as a full matrix
        // (touching elements twice and, presumably, reading past its end).
        doConjugate( (AP_ + params_.offA), ((params_.N * (params_.N + 1)) / 2 ), 1, 1 );
    }

    time = getCurrentTime();
    clMath::blas::hpmv(order, fUplo, params_.N, alpha, AP_, params_.offA,
                       X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
    time = getCurrentTime() - time;

#endif  // PERF_TEST_WITH_ACML

    return time;
}
コード例 #2
0
ファイル: perf-her2.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Times a single invocation of the reference (host-side) HER2 routine.
 *
 * The timed call exists only when PERF_TEST_WITH_ACML is defined; otherwise
 * 0 is returned.  Row-major input is refused with NANOTIME_ERR unless
 * PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Difference of the getCurrentTime() readings around the reference
 *         call, 0 when no reference call is compiled in, or NANOTIME_ERR.
 */
template <typename ElemType> nano_time_t
Her2PerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    nano_time_t elapsed = 0;
    clblasOrder refOrder = params_.order;
    size_t refLda = params_.lda;
    clblasUplo refUplo = params_.uplo;

#ifdef PERF_TEST_WITH_ACML

    // Operands as handed to the reference routine; by default they are the
    // test's own X and Y vectors.
    ElemType *firstVec = X_;
    ElemType *secondVec = Y_;
    size_t firstOff = params_.offBX;
    size_t secondOff = params_.offCY;
    int firstInc = params_.incx;
    int secondInc = params_.incy;

    if (refOrder != clblasColumnMajor)
    {
        // Row-major input is served through a column-major call: both
        // vectors are conjugated in place, the triangle is flipped and the
        // two vectors trade places.
        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
        doConjugate( (Y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );

        refOrder = clblasColumnMajor;
        if (refUplo == clblasLower) {
            refUplo = clblasUpper;
        }
        else {
            refUplo = clblasLower;
        }

        firstVec = Y_;
        firstOff = params_.offCY;
        firstInc = params_.incy;

        secondVec = X_;
        secondOff = params_.offBX;
        secondInc = params_.incx;
    }

    elapsed = getCurrentTime();
    clMath::blas::her2(refOrder, refUplo, params_.N, alpha_,
                       firstVec, firstOff, firstInc,
                       secondVec, secondOff, secondInc,
                       A_, params_.offa, refLda);
    elapsed = getCurrentTime() - elapsed;

#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
コード例 #3
0
ファイル: perf-trsv.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Times a single invocation of the reference (host-side) TRSV routine.
 *
 * X_ is first reloaded from backX_, since TRSV overwrites its vector
 * operand.  The timed call exists only when PERF_TEST_WITH_ACML is defined;
 * otherwise 0 is returned.  Row-major input is refused with NANOTIME_ERR
 * unless PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Difference of the getCurrentTime() readings around the reference
 *         call, 0 when no reference call is compiled in, or NANOTIME_ERR.
 */
template <typename ElemType> nano_time_t
TrsvPerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    // Number of elements spanned by X, including the leading offset.
    const size_t spanX = (1 + ((params_.N-1) * abs(params_.incx))) + params_.offBX;
    memcpy(X_, backX_, spanX * sizeof(ElemType));

    nano_time_t elapsed = 0;
    clblasOrder refOrder = params_.order;
    clblasUplo refUplo = params_.uplo;
    clblasTranspose refTrans = params_.transA;
    size_t refLda = params_.lda;

#ifdef PERF_TEST_WITH_ACML

    if (refOrder != clblasColumnMajor)
    {
        // Row-major input is served through a column-major call on the
        // opposite triangle with the transposition toggled.
        refOrder = clblasColumnMajor;

        if (params_.uplo == clblasUpper) {
            refUplo = clblasLower;
        }
        else {
            refUplo = clblasUpper;
        }

        if (params_.transA == clblasNoTrans) {
            refTrans = clblasTrans;
        }
        else {
            refTrans = clblasNoTrans;
        }

        // A conjugate-transpose request is mapped to "no transpose" above,
        // so the conjugation is applied to the matrix data itself.
        if (params_.transA == clblasConjTrans) {
            doConjugate( A_ + params_.offa, params_.N, params_.N, refLda );
        }
    }

    elapsed = getCurrentTime();
    clMath::blas::trsv(refOrder, refUplo, refTrans, params_.diag,
                       params_.N, A_, params_.offa, refLda,
                       X_, params_.offBX, params_.incx);
    elapsed = getCurrentTime() - elapsed;

#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
コード例 #4
0
ファイル: perf-gbmv.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Times a single invocation of the reference (host-side) GBMV routine.
 *
 * Y_ is first reloaded from backY_, and a row-major problem is rewritten as
 * the equivalent column-major one (dimensions and band widths swapped,
 * transposition toggled).  The timed call exists only when
 * PERF_TEST_WITH_ACML is defined; otherwise 0 is returned.
 *
 * @return Difference of the getCurrentTime() readings around the reference
 *         call, or 0 when no reference call is compiled in.
 */
template <typename ElemType> nano_time_t
GbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;
    clblasOrder fOrder;
    clblasTranspose fTrans;
    size_t lda, lenY, lenA;
    size_t fM = params_.M, fN = params_.N, fKL = params_.KL, fKU = params_.KU;

    // Magnitude of the Y stride.  incy may be negative; multiplying an
    // unsigned element count by a negative stride would wrap around and hand
    // memcpy a bogus size.
    const size_t strideY = (params_.incy < 0) ? static_cast<size_t>(-params_.incy)
                                              : static_cast<size_t>(params_.incy);

    lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * params_.lda;
    lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1) * strideY + 1 + params_.offCY;

    // Reload Y from its backup copy before the run.
    memcpy(Y_, backY_, lenY * sizeof(ElemType));
    fOrder = params_.order;
    fTrans = params_.transA;
    lda = params_.lda;

    if (fOrder != clblasColumnMajor)
    {
        fOrder = clblasColumnMajor;
        fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
        fM = params_.N;
        fN = params_.M;
        fKL = params_.KU;
        fKU = params_.KL;

        // A conjugate-transpose request is mapped to "no transpose" above,
        // so the conjugation is applied to the matrix data itself.  The same
        // offset (offA) is used here as in the GBMV call below, so exactly
        // the data the routine reads is conjugated.
        if( params_.transA == clblasConjTrans )
            doConjugate( (A_ + params_.offA), 1, lenA, lda );
    }

#ifdef PERF_TEST_WITH_ACML

    time = getCurrentTime();
    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A_, params_.offA, lda,
                       X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
    time = getCurrentTime() - time;

#endif  // PERF_TEST_WITH_ACML

    return time;
}
コード例 #5
0
ファイル: perf-hpr.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Times a single invocation of the reference (host-side) HPR routine.
 *
 * The timed call exists only when PERF_TEST_WITH_ACML is defined; otherwise
 * 0 is returned.  Row-major input is refused with NANOTIME_ERR unless
 * PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Difference of the getCurrentTime() readings around the reference
 *         call, 0 when no reference call is compiled in, or NANOTIME_ERR.
 */
template <typename ElemType> nano_time_t
HprPerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    nano_time_t elapsed = 0;
    clblasOrder requestedOrder = params_.order;

#ifdef PERF_TEST_WITH_ACML

    clblasOrder refOrder = params_.order;
    clblasUplo refUplo = params_.uplo;

    if (requestedOrder != clblasColumnMajor)
    {
        // Row-major input is served through a column-major call on the
        // opposite triangle with X conjugated in place.
        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
        refOrder = clblasColumnMajor;

        if (refUplo == clblasLower) {
            refUplo = clblasUpper;
        }
        else {
            refUplo = clblasLower;
        }
    }

    elapsed = getCurrentTime();
    clMath::blas::hpr(refOrder, refUplo, params_.N, CREAL(alpha_),
                      X_, params_.offBX, params_.incx, AP_, params_.offa);
    elapsed = getCurrentTime() - elapsed;

#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
コード例 #6
0
ファイル: corr-gbmv.cpp プロジェクト: clMathLibraries/clBLAS
/**
 * Checks the clBLAS GBMV implementation against the host reference.
 *
 * Random A, X and Y are generated from params->seed, the reference routine
 * is run on a host copy of Y, the clBLAS routine is run on device buffers,
 * and the two Y vectors are compared with compareMatrices().  The test is
 * skipped (reported as success) when the device lacks double precision for
 * a double-based T, or when host/device buffers cannot be set up.
 *
 * T is the element type; its declaration is not part of this excerpt.
 *
 * @param params  problem description (order, transposition, sizes, band
 *                widths, strides, offsets, seed, number of command queues).
 */
void
gbmvCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *X, *blasY, *clblasY;
    cl_mem bufA, bufX, bufY;
    clMath::BlasBase *base;
    cl_event *events;
    T alpha, beta;
    size_t lengthX, lengthY, lengthA;

    base = clMath::BlasBase::getInstance();

    if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    lengthA = ((params->order == clblasColumnMajor)? params->N : params->M) * params->lda;

    // X and Y swap roles (length N vs. length M) when A is transposed.
    if (params->transA == clblasNoTrans) {
        lengthX = (params->N - 1)*abs(params->incx) + 1;
        lengthY = (params->M - 1)*abs(params->incy) + 1;
    }
    else {
        lengthX = (params->M - 1)*abs(params->incx) + 1;
        lengthY = (params->N - 1)*abs(params->incy) + 1;
    }

    A       = new T[lengthA + params->offA ];
    X       = new T[lengthX + params->offBX ];
    blasY   = new T[lengthY + params->offCY ];
    clblasY = new T[lengthY + params->offCY ];

    srand(params->seed);

    // NOTE: plain new[] reports failure by throwing std::bad_alloc, so this
    // check is purely defensive.
    if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL))
    {
        deleteBuffers<T>(A, X, blasY, clblasY);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    alpha = convertMultiplier<T>(params->alpha);
    beta = convertMultiplier<T>(params->beta);

    randomGbmvMatrices(params->order, params->transA, params->M, params->N, &alpha, &beta,
                        (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy );
    // Copy blasY to clblasY so both implementations start from the same Y.
    memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY));

    // Upload the inputs.  This happens before the reference call below, so
    // the device always receives the unmodified A.
    bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY);
    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);
    bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE);

    clblasOrder fOrder;
    clblasTranspose fTrans;
    fOrder = params->order;
    fTrans = params->transA;
    size_t fM = params->M, fN = params->N, fKL = params->KL, fKU = params->KU;

    if (fOrder != clblasColumnMajor)
    {
        // The reference routine is driven in column-major order: swap the
        // dimensions and band widths and toggle the transposition.
        fOrder = clblasColumnMajor;
        fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
        fM = params->N;
        fN = params->M;
        fKL = params->KU;
        fKU = params->KL;

        // A conjugate-transpose request is mapped to "no transpose" above,
        // so the conjugation is applied to the matrix data itself.  A lives
        // at offset offA everywhere else in this test (allocation, upload
        // and both GBMV calls), so the same offset is used here.
        if( params->transA == clblasConjTrans )
            doConjugate( (A + params->offA), 1, lengthA, params->lda );
    }
    clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A, params->offA, params->lda,
                       X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy);

    if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) {
        // Skip the test, the most probable reason is
        //     matrix too big for a device.

        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)clMath::clblas::gbmv(params->order, params->transA, params->M, params->N, params->KL, params->KU,
                                        alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx,
                                        beta, bufY, params->offCY, params->incy,
                                        params->numCommandQueues, base->commandQueues(), 0, NULL, events);

    // Resources are released before the assertion is evaluated, because the
    // assertion is what reports the failure for this test.
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GBMV() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
        base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufX, bufY);
        deleteBuffers<T>(A, X, blasY, clblasY);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read (CL_TRUE) of the device result back into clblasY.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0,
        (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0,
        NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ::std::cerr << "GBMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufX, bufY);
    compareMatrices<T>(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY),
                       lengthY);

    // Dump the failing configuration so the run can be reproduced.
    if (::testing::Test::HasFailure())
    {
        printTestParams(params->order, params->transA, params->M, params->N, params->KL, params->KU, params->alpha, params->offA,
            params->lda, params->offBX, params->incx, params->beta, params->offCY, params->incy);
        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, X, blasY, clblasY);
    delete[] events;
}
コード例 #7
0
ファイル: corr-hpmv.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Checks the clBLAS HPMV implementation against the host reference.
 *
 * Random packed A, X and Y are generated from params->seed, the reference
 * routine is run on a host copy of Y, the clBLAS routine is run on device
 * buffers, and the two Y vectors are compared with compareMatrices().  The
 * test is skipped (reported as success) when the device lacks double
 * precision for DoubleComplex, or when buffers cannot be set up.
 *
 * T is the element type; its declaration is not part of this excerpt.
 *
 * @param params  problem description (order, triangle, size, strides,
 *                offsets, seed, number of command queues).
 */
void
hpmvCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();

    const bool isDoubleComplex = (typeid(T) == typeid(DoubleComplex));
    if (isDoubleComplex && !blasBase->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *queueEvents = new cl_event[params->numCommandQueues];
    memset(queueEvents, 0, params->numCommandQueues * sizeof(cl_event));

    // Element counts of the operands proper (without leading offsets).
    const size_t countAP = (params->N * (params->N + 1)) / 2;
    const size_t countX = (1 + ((params->N -1) * abs(params->incx)));
    const size_t countY = (1 + ((params->N -1) * abs(params->incy)));

    // Element counts of the allocations, leading offsets included.
    const size_t totalAP = countAP + params->offA;
    const size_t totalX = countX + params->offBX;
    const size_t totalY = countY + params->offCY;

    T *packedA = new T[totalAP];
    T *vecX = new T[totalX];
    T *refY = new T[totalY];
    T *devY = new T[totalY];

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    if ((packedA == NULL) || (vecX == NULL) || (refY == NULL) || (devY == NULL))
    {
        deleteBuffers<T>(packedA, vecX, refY, devY);
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] queueEvents;
        SUCCEED();
        return;
    }

    T alphaVal = convertMultiplier<T>(params->alpha);
    T betaVal = convertMultiplier<T>(params->beta);

    randomHemvMatrices(params->order, params->uplo, params->N, true, &alphaVal, (packedA + params->offA), params->lda,
                       (vecX + params->offBX), params->incx, true, &betaVal, (refY + params->offCY), params->incy);

    // Both implementations must start from the same Y.
    memcpy(devY, refY, totalY * sizeof(*refY));
    ::std::cerr << "Done" << ::std::endl;

    // Device-side copies; created before the reference call so the device
    // receives the packed matrix as generated.
    cl_mem memAP = blasBase->createEnqueueBuffer(packedA, totalAP * sizeof(*packedA), 0, CL_MEM_READ_ONLY);
    cl_mem memX = blasBase->createEnqueueBuffer(vecX, totalX * sizeof(*vecX), 0, CL_MEM_READ_ONLY);
    cl_mem memY = blasBase->createEnqueueBuffer(devY, totalY * sizeof(*devY), 0, CL_MEM_READ_WRITE);

    ::std::cerr << "Calling reference xHPMV routine... ";

    clblasOrder refOrder = params->order;
    clblasUplo refUplo = params->uplo;

    if (refOrder != clblasColumnMajor)
    {
        // Row-major input is served through a column-major call on the
        // opposite triangle with the packed matrix conjugated.
        refOrder = clblasColumnMajor;
        if (params->uplo == clblasUpper) {
            refUplo = clblasLower;
        }
        else {
            refUplo = clblasUpper;
        }
        doConjugate( (packedA + params->offA), countAP, 1, 1 );
    }
    ::clMath::blas::hpmv( refOrder, refUplo, params->N, alphaVal, packedA, params->offA,
                          vecX, params->offBX, params->incx,
                          betaVal, refY, params->offCY, params->incy);
    ::std::cerr << "Done" << ::std::endl;

    const bool buffersMissing = (memAP == NULL) || (memX == NULL) || (memY == NULL);
    if (buffersMissing) {
        // Most likely the problem is too large for the device: skip.
        releaseMemObjects(memAP, memX, memY);
        deleteBuffers<T>(packedA, vecX, refY, devY);
        delete[] queueEvents;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xHPMV routine... ";

    cl_int status = (cl_int)::clMath::clblas::hpmv(params->order, params->uplo, params->N, alphaVal, memAP,
                        params->offA, memX, params->offBX, params->incx, betaVal, memY, params->offCY, params->incy,
                        params->numCommandQueues, blasBase->commandQueues(), 0, NULL, queueEvents);

    // Resources are released before the assertion is evaluated, because the
    // assertion is what reports the failure for this test.
    if (status != CL_SUCCESS) {
        releaseMemObjects(memAP, memX, memY);
        deleteBuffers<T>(packedA, vecX, refY, devY);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "::clMath::clblas::HPMV() failed";
    }

    status = waitForSuccessfulFinish(params->numCommandQueues,
        blasBase->commandQueues(), queueEvents);
    if (status != CL_SUCCESS) {
        releaseMemObjects(memAP, memX, memY);
        deleteBuffers<T>(packedA, vecX, refY, devY);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read (CL_TRUE) of the device result back into devY.
    status = clEnqueueReadBuffer(blasBase->commandQueues()[0], memY, CL_TRUE, 0,
        totalY * sizeof(*devY), devY, 0,
        NULL, NULL);
    if (status != CL_SUCCESS)
    {
        ::std::cerr << "HPMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(memAP, memX, memY);

    compareMatrices<T>(clblasColumnMajor, countY , 1, (refY + params->offCY), (devY + params->offCY),
                       countY);
    deleteBuffers<T>(packedA, vecX, refY, devY);
    delete[] queueEvents;
}
コード例 #8
0
ファイル: corr-spr.cpp プロジェクト: clMathLibraries/clBLAS
/**
 * Checks the clBLAS SPR implementation against the host reference.
 *
 * A random packed matrix and vector X are generated from params->seed, the
 * reference routine updates a host copy of the packed matrix, the clBLAS
 * routine updates a device copy, and the two results are compared with
 * compareMatrices().  The test is skipped (reported as success) when the
 * device lacks double precision for cl_double, or when buffers cannot be
 * set up.
 *
 * T is the element type; its declaration is not part of this excerpt.
 *
 * @param params  problem description (order, triangle, size, stride,
 *                offsets, seed, number of command queues).
 */
void
sprCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *blasAP, *clblasAP, *X;
    cl_mem bufAP, bufX;
    clMath::BlasBase *base;
    cl_event *events;
    bool useAlpha;
    T alpha;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Packed storage of one triangle: N*(N+1)/2 elements.
    size_t lengthAP = ( ( params->N*( params->N + 1 ) )/2 );
    size_t lengthX = (1 + ((params->N -1) * abs(params->incx)));

    blasAP      = new T[lengthAP + params->offa];
    clblasAP    = new T[lengthAP + params->offa];
    X           = new T[lengthX + params->offBX];

    srand(params->seed);

    // Validate the allocations before touching them.
    // NOTE: plain new[] reports failure by throwing std::bad_alloc, so this
    // check is purely defensive.
    if((blasAP == NULL) || (X == NULL) || (clblasAP == NULL))
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        SUCCEED();
        return;
    }

    // Pre-fill every buffer with 0xFF bytes.  memset() takes a byte count,
    // so the element counts are scaled by the element size; without the
    // scaling only a fraction of each buffer was initialised.
    memset(blasAP, -1, (lengthAP + params->offa) * sizeof(*blasAP));
    memset(clblasAP, -1, (lengthAP + params->offa) * sizeof(*clblasAP));
    memset(X, -1, (lengthX + params->offBX) * sizeof(*X));

    alpha =  convertMultiplier<T>(params->alpha);
    useAlpha = true;

    #ifdef DEBUG_SPR
    printf("ALPHA in CORR_SPR.CPP %f\n", alpha);
    #endif

    randomSyrMatrices<T>(params->order, params->uplo, params->N, useAlpha, &alpha,
                        (blasAP + params->offa), 0, (X + params->offBX), params->incx);

    // Both implementations must start from the same packed matrix.
    memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP));

    bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa) * sizeof(*clblasAP), 0, CL_MEM_READ_WRITE);
    bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY);

    clblasOrder order;
    clblasUplo fUplo;
    order = params->order;
    fUplo = params->uplo;

    if (order != clblasColumnMajor)
    {
        // Row-major input is served through a column-major call on the
        // opposite triangle.
        order = clblasColumnMajor;
        fUplo =  (params->uplo == clblasUpper)? clblasLower : clblasUpper;

        if( params->transA == clblasConjTrans )
            doConjugate( (blasAP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );

    }

    // 'order' is always clblasColumnMajor at this point.
    clMath::blas::spr( order, fUplo, params->N, alpha, X, params->offBX, params->incx, blasAP, params->offa);

    if ((bufAP == NULL) || (bufX == NULL) ) {
        /* Skip the test, the most probable reason is
         *     matrix too big for a device.
         */
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::spr( params->order, params->uplo, params->N, alpha,
                        bufX, params->offBX, params->incx, bufAP, params->offa,
                        params->numCommandQueues, base->commandQueues(),
                        0, NULL, events);

    // Resources are released before the assertion is evaluated, because the
    // assertion is what reports the failure for this test.
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPR() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
        base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufAP, bufX);
        deleteBuffers<T>(blasAP, clblasAP, X);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read (CL_TRUE) of the device result back into clblasAP.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0,
        (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0,
        NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ::std::cerr << "SPR: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufAP, bufX);
    compareMatrices<T>(clblasColumnMajor, lengthAP , 1, (blasAP + params->offa), (clblasAP + params->offa),
                       lengthAP);

    deleteBuffers<T>(blasAP, clblasAP, X);
    delete[] events;
}
コード例 #9
0
ファイル: perf-gerc.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Times a single invocation of the reference (host-side) GERC routine.
 *
 * The timed call exists only when PERF_TEST_WITH_ACML is defined; otherwise
 * 0 is returned.  Row-major input is refused with NANOTIME_ERR unless
 * PERF_TEST_WITH_ROW_MAJOR is defined.
 *
 * @return Difference of the getCurrentTime() readings around the reference
 *         call, 0 when no reference call is compiled in, or NANOTIME_ERR.
 */
template <typename ElemType> nano_time_t
GercPerformanceTest<ElemType>::etalonPerfSingle(void)
{
#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    nano_time_t elapsed = 0;
    clblasOrder requestedOrder = params_.order;
    size_t requestedLda = params_.lda;

#ifdef PERF_TEST_WITH_ACML

    // Operands as handed to the reference routine; by default they are the
    // test's own dimensions and vectors.
    size_t rows = params_.M;
    size_t cols = params_.N;
    ElemType *firstVec = x_;
    ElemType *secondVec = y_;
    size_t firstOff = params_.offBX;
    size_t secondOff = params_.offCY;
    int firstInc = params_.incx;
    int secondInc = params_.incy;

    if (requestedOrder == clblasColumnMajor) {
        elapsed = getCurrentTime();
        clMath::blas::gerc(requestedOrder, rows, cols, alpha_,
                           firstVec, firstOff, firstInc,
                           secondVec, secondOff, secondInc,
                           A_, params_.offa, requestedLda);
    }
    else {
        // Row-major input is served through a column-major GER call: Y is
        // conjugated in place (outside the timed region), then dimensions
        // and vectors trade places.  This follows the legacy guide, per the
        // original author's note.
        doConjugate( (y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );

        rows = params_.N;
        cols = params_.M;
        firstVec = y_;
        secondVec = x_;
        firstInc = params_.incy;
        secondInc = params_.incx;
        firstOff = params_.offCY;
        secondOff = params_.offBX;

        elapsed = getCurrentTime();
        clMath::blas::ger(clblasColumnMajor, rows, cols, alpha_,
                          firstVec, firstOff, firstInc,
                          secondVec, secondOff, secondInc,
                          A_, params_.offa, params_.lda);
    }
    elapsed = getCurrentTime() - elapsed;

#endif  // PERF_TEST_WITH_ACML

    return elapsed;
}
コード例 #10
0
ファイル: corr-tpmv.cpp プロジェクト: AndreasMiller/clBLAS
/**
 * Checks the clBLAS TPMV implementation against the host reference.
 *
 * A random packed triangular matrix and vector X are generated from
 * params->seed, the reference routine updates a host copy of X, the clBLAS
 * routine updates a device copy, and the two X vectors are compared with
 * compareMatrices().  The test is skipped (reported as success) when the
 * device lacks double precision for a double-based T, or when buffers
 * cannot be set up.
 *
 * T is the element type; its declaration is not part of this excerpt.
 *
 * @param params  problem description (order, triangle, transposition,
 *                diagonal kind, size, stride, offsets, seed, number of
 *                command queues).
 */
void
tpmvCorrectnessTest(TestParams *params)
{
    clMath::BlasBase *blasBase = clMath::BlasBase::getInstance();

    const bool isDoubleBased = (typeid(T) == typeid(cl_double) ||
                                typeid(T) == typeid(DoubleComplex));
    if (isDoubleBased && !blasBase->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    printf("number of command queues : %d\n\n", params->numCommandQueues);

    cl_event *queueEvents = new cl_event[params->numCommandQueues];
    memset(queueEvents, 0, params->numCommandQueues * sizeof(cl_event));

    // Element counts of the operands proper (without leading offsets).
    const size_t countAP = (params->N *( params->N + 1 ))/2 ;
    const size_t countX = (1 + ((params->N -1) * abs(params->incx)));

    // Element counts of the allocations, leading offsets included.
    const size_t totalAP = countAP + params->offa;
    const size_t totalX = countX + params->offBX;

    T *packedA = new T[totalAP];
    T *refX = new T[totalX];
    T *devX = new T[totalX];

    if ((packedA == NULL) || (refX == NULL) || (devX == NULL))
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        SUCCEED();
        return;
    }

    srand(params->seed);

    ::std::cerr << "Generating input data... ";

    // Flags steering populate(): random content in packed layout, plus the
    // storage order (column-major unless told otherwise) and the triangle.
    int creationFlags = 0;
    creationFlags = creationFlags | RANDOM_INIT | PACKED_MATRIX;
    if ((params->order) == clblasRowMajor) {
        creationFlags = creationFlags | ROW_MAJOR_ORDER;
    }
    if ((params->uplo) == clblasLower) {
        creationFlags = creationFlags | LOWER_HALF_ONLY;
    }
    else {
        creationFlags = creationFlags | UPPER_HALF_ONLY;
    }
    BlasRoutineID routineId = CLBLAS_TRMV;

    // Fill the packed matrix and the whole of the reference X.
    populate( packedA + params->offa, params-> N, params-> N, 0, routineId, creationFlags);
    populate( refX , totalX, 1, totalX, routineId);

    // Both implementations must start from the same X.
    memcpy(devX, refX, totalX * sizeof(*refX));
    ::std::cerr << "Done" << ::std::endl;

    // Device-side copies plus a scratch vector handed to the clBLAS call.
    cl_mem memAP = blasBase->createEnqueueBuffer(packedA, totalAP * sizeof(*packedA), 0, CL_MEM_READ_ONLY);
    cl_mem memX = blasBase->createEnqueueBuffer(devX, totalX * sizeof(*devX), 0, CL_MEM_WRITE_ONLY);
    cl_mem memScratch = blasBase->createEnqueueBuffer(NULL, countX * sizeof(*devX), 0, CL_MEM_READ_ONLY);

    ::std::cerr << "Calling reference xTPMV routine... ";

    clblasOrder refOrder = params->order;
    clblasUplo refUplo = params->uplo;
    clblasTranspose refTrans = params->transA;

    if (refOrder != clblasColumnMajor)
    {
        // Row-major input is served through a column-major call on the
        // opposite triangle with the transposition toggled.
        refOrder = clblasColumnMajor;

        if (params->uplo == clblasUpper) {
            refUplo = clblasLower;
        }
        else {
            refUplo = clblasUpper;
        }

        if (params->transA == clblasNoTrans) {
            refTrans = clblasTrans;
        }
        else {
            refTrans = clblasNoTrans;
        }

        // A conjugate-transpose request is mapped to "no transpose" above,
        // so the conjugation is applied to the matrix data itself.
        if (params->transA == clblasConjTrans) {
            doConjugate( (packedA + params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 );
        }
    }

    ::clMath::blas::tpmv( refOrder, refUplo, refTrans, params->diag, params->N,
                          packedA, params->offa, refX, params->offBX, params->incx);
    ::std::cerr << "Done" << ::std::endl;

    const bool buffersMissing = (memAP == NULL) || (memX == NULL) || (memScratch == NULL);
    if (buffersMissing) {
        /* Most likely the problem is too large for the device: skip. */
        releaseMemObjects(memAP, memX, memScratch);
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling clblas xTPMV routine... ";

    // Map the element type onto the runtime type tag expected by the wrapper.
    DataType elemKind;
    if (typeid(T) == typeid(cl_float)) {
        elemKind = TYPE_FLOAT;
    }
    else if (typeid(T) == typeid(cl_double)) {
        elemKind = TYPE_DOUBLE;
    }
    else if (typeid(T) == typeid(cl_float2)) {
        elemKind = TYPE_COMPLEX_FLOAT;
    }
    else {
        elemKind = TYPE_COMPLEX_DOUBLE;
    }

    cl_int status = (cl_int)::clMath::clblas::tpmv( elemKind, params->order, params->uplo, params->transA, params->diag, params->N, memAP,
                        params->offa, memX, params->offBX, params->incx, memScratch, params->numCommandQueues, blasBase->commandQueues(),
                        0, NULL, queueEvents);

    // Resources are released before the assertion is evaluated, because the
    // assertion is what reports the failure for this test.
    if (status != CL_SUCCESS) {
        releaseMemObjects(memAP, memX, memScratch);
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "::clMath::clblas::TPMV() failed";
    }

    status = waitForSuccessfulFinish(params->numCommandQueues,
        blasBase->commandQueues(), queueEvents);
    if (status != CL_SUCCESS) {
        releaseMemObjects(memAP, memX, memScratch);
        deleteBuffers<T>(packedA, refX, devX);
        delete[] queueEvents;
        ASSERT_EQ(CL_SUCCESS, status) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read (CL_TRUE) of the device result back into devX.
    status = clEnqueueReadBuffer(blasBase->commandQueues()[0], memX, CL_TRUE, 0,
        totalX * sizeof(*devX), devX, 0,
        NULL, NULL);
    if (status != CL_SUCCESS)
    {
        ::std::cerr << "TPMV: Reading results failed...." << std::endl;
    }

    releaseMemObjects(memAP, memX, memScratch);

    compareMatrices<T>(clblasColumnMajor, countX , 1, (refX + params->offBX), (devX + params->offBX),
                       countX);
    deleteBuffers<T>(packedA, refX, devX);
    delete[] queueEvents;
}
コード例 #11
0
ファイル: corr-gerc.cpp プロジェクト: clMathLibraries/clBLAS
/**
 * Checks the clBLAS GERC implementation against the host reference.
 *
 * Random A, x and y are generated from params->seed.  The reference routine
 * updates the host matrix A in place, the clBLAS routine updates a device
 * copy that is read back into backA, and the two matrices are compared with
 * compareMatrices().  The test is skipped (reported as success) when the
 * device lacks double precision for a double-based T, or when buffers
 * cannot be set up.
 *
 * T is the element type; its declaration is not part of this excerpt.
 *
 * @param params  problem description (order, sizes, lda, strides, offsets,
 *                alpha, seed, number of command queues).
 */
void
gercCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *x, *y, *backA;

    T alpha_;
    cl_mem bufA, bufx, bufy;
    clMath::BlasBase *base;
    cl_event *events;

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    size_t lengthA;
    if( params->order == clblasColumnMajor )
        lengthA = params->N  * params->lda;
    else
        lengthA = params->M  * params->lda;

    size_t lengthx = (1 + (((params->M)-1) * abs(params->incx)));
    size_t lengthy = (1 + (((params->N)-1) * abs(params->incy)));

    bool useAlpha = base->useAlpha();

    // NOTE(review): alpha_ stays uninitialised when useAlpha is false, yet it
    // is passed to both implementations below.  The intended default is not
    // visible here - confirm and initialise accordingly.
    if (useAlpha) {
        alpha_ = convertMultiplier<T>(params->alpha);
    }

    A       = new T[lengthA + params->offa];
    x       = new T[lengthx + params->offBX];
    y       = new T[lengthy + params->offCY];
    backA   = new T[lengthA + params->offa];

    // NOTE: plain new[] reports failure by throwing std::bad_alloc, so this
    // check is purely defensive.
    if((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL))
    {
        ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        SUCCEED();
        return;
    }
    srand(params->seed);

    int creationFlags = 0;
    creationFlags =  creationFlags | RANDOM_INIT;
    creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
    BlasRoutineID BlasFn = CLBLAS_GER;

    populate( (A + params->offa), params->M, params->N, params-> lda, BlasFn, creationFlags);
    populate( (x + params->offBX), lengthx, 1, lengthx, BlasFn );
    populate( (y + params->offCY), lengthy, 1, lengthy, BlasFn );

    // Copy A to backA; backA later receives the device result.
    memcpy(backA, A, (lengthA + params->offa) * sizeof(T));

    // Upload the inputs.  This happens before the reference call below, so
    // the device receives A and y as generated.
    bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE);
    bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x), 0, CL_MEM_READ_ONLY);
    bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY);

    clblasOrder fOrder;
    size_t fN, fM;
    size_t fOffx, fOffy;
    int fIncx, fIncy;
    T *fX, *fY;
    fOrder = params->order;
    fM = params->M;
    fN = params->N;
    fIncx = params->incx;
    fIncy = params->incy;
    fX = x;
    fY = y;
    fOffx = params->offBX;
    fOffy = params->offCY;

    if (fOrder != clblasColumnMajor) {
        // Row-major input is served through a column-major GER call: y is
        // conjugated in place, then dimensions and vectors trade places.
        doConjugate( (y + params->offCY), (1 + (params->N-1) * abs(params->incy)), 1, 1 );
        fOrder = clblasColumnMajor;
        fM = params->N;
        fN = params->M;
        fX = y;
        fY = x;
        fIncx = params->incy;
        fIncy = params->incx;
        fOffx = params->offCY;
        fOffy = params->offBX;
        // Note this according to the Legacy guide
        clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A, params->offa, params->lda);
    }
    else {
        clMath::blas::gerc(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A, params->offa, params->lda);
    }

    if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) {
        /* Skip the test, the most probable reason is
         *     matrix too big for a device.
         */
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    err = (cl_int)::clMath::clblas::gerc( params->order, params->M, params->N, alpha_,
                            bufx, params->offBX, params->incx, bufy, params->offCY, params->incy,bufA, params->offa, params->lda,
                            params->numCommandQueues, base->commandQueues(), 0, NULL, events );

    // Resources are released before the assertion is evaluated, because the
    // assertion is what reports the failure for this test.
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GERC() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
        base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufx, bufy);
        deleteBuffers<T>(A, x, y, backA);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }

    // Blocking read (CL_TRUE) of the device result into backA.  A failed
    // read is reported instead of being silently ignored, so a subsequent
    // comparison mismatch can be traced back to it.
    err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0,
        (lengthA + params->offa)* sizeof(*backA), backA, 0,
        NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ::std::cerr << "GERC: Reading results failed...." << std::endl;
    }

    releaseMemObjects(bufA, bufx, bufy);

    // handle lda correctly based on row-major/col-major..
    compareMatrices<T>(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda);

    // Dump the failing configuration so the run can be reproduced.
    if (::testing::Test::HasFailure())
    {
        printTestParams(params->order, params->M, params->N, useAlpha,
            base->alpha(),
            params->lda, params->incx, params->incy, params->offa, params->offBX, params->offCY);

        ::std::cerr << "seed = " << params->seed << ::std::endl;
        ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl;
    }

    deleteBuffers<T>(A, x, y, backA);
    delete[] events;
}