inline static void magma_clarfxsym_v2( magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *V, magmaFloatComplex *TAU, magmaFloatComplex *work) { /* WORK (workspace) float complex array, dimension N */ magma_int_t ione = 1; magmaFloatComplex dtmp; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one= MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; /* X = AVtau */ blasf77_chemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ dtmp = magma_cblas_cdotc(n, work, ione, V, ione); /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_caxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
extern "C" void magma_clarfxsym( magma_int_t N, magmaFloatComplex *A, magma_int_t LDA, magmaFloatComplex *V, magmaFloatComplex *TAU) { magma_int_t IONE=1; magmaFloatComplex dtmp; magmaFloatComplex Z_ZERO = MAGMA_C_ZERO; //magmaFloatComplex Z_ONE = MAGMA_C_ONE; magmaFloatComplex Z_MONE = MAGMA_C_NEG_ONE; magmaFloatComplex Z_HALF = MAGMA_C_HALF; //magmaFloatComplex WORK[N]; magmaFloatComplex *WORK; magma_cmalloc_cpu( &WORK, N ); /* apply left and right on A(st:ed,st:ed)*/ //magma_clarfxsym(len,A(st,st),LDX,V(st),TAU(st)); /* X = AVtau */ blasf77_chemv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE); /* je calcul dtmp= X'*V */ dtmp = magma_cblas_cdotc(N, WORK, IONE, V, IONE); /* je calcul 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * Z_HALF * (*TAU); /* je calcul W=X-1/2VX'Vt = X - dtmp*V */ /* for (j = 0; j < N; j++) WORK[j] = WORK[j] + (dtmp*V[j]); */ blasf77_caxpy(&N, &dtmp, V, &IONE, WORK, &IONE); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA); magma_free_cpu(WORK); }
inline static void magma_clarfxsym_v2(magma_int_t n, magmaFloatComplex *A, magma_int_t lda, magmaFloatComplex *V, magmaFloatComplex *TAU, magmaFloatComplex *work) { /* WORK (workspace) float complex array, dimension N */ magma_int_t ione = 1; magmaFloatComplex dtmp; magmaFloatComplex c_zero = MAGMA_C_ZERO; magmaFloatComplex c_neg_one= MAGMA_C_NEG_ONE; magmaFloatComplex c_half = MAGMA_C_HALF; /* X = AVtau */ blasf77_chemv("L",&n, TAU, A, &lda, V, &ione, &c_zero, work, &ione); /* compute dtmp= X'*V */ #if defined(PRECISION_z) || defined(PRECISION_c) dtmp = c_zero; for (magma_int_t j = 0; j < n; j++) dtmp = dtmp + MAGMA_C_CNJG(work[j]) * V[j]; //cblas_cdotc_sub(n, work, ione, V, ione, &dtmp); #else dtmp = cblas_cdotc(n, work, ione, V, ione); #endif /* compute 1/2 X'*V*t = 1/2*dtmp*tau */ dtmp = -dtmp * c_half * (*TAU); /* compute W=X-1/2VX'Vt = X - dtmp*V */ blasf77_caxpy(&n, &dtmp, V, &ione, work, &ione); /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */ blasf77_cher2("L", &n, &c_neg_one, work, &ione, V, &ione, A, &lda); }
int main(int argc, char **argv) { TESTING_INIT(); real_Double_t gflops, magma_perf, magma_time, cublas_perf, cublas_time, cpu_perf, cpu_time; float magma_error, cublas_error, work[1]; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t N, lda, sizeA, sizeX, sizeY, blocks, ldwork; magma_int_t incx = 1; magma_int_t incy = 1; magma_int_t nb = 64; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex alpha = MAGMA_C_MAKE( 1.5, -2.3 ); magmaFloatComplex beta = MAGMA_C_MAKE( -0.6, 0.8 ); magmaFloatComplex *A, *X, *Y, *Ycublas, *Ymagma; magmaFloatComplex *dA, *dX, *dY, *dC_work; magma_opts opts; parse_opts( argc, argv, &opts ); printf(" N MAGMA Gflop/s (ms) CUBLAS Gflop/s (ms) CPU Gflop/s (ms) MAGMA error CUBLAS error\n"); printf("=============================================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[i]; lda = ((N + 31)/32)*32; sizeA = N*lda; sizeX = N*incx; sizeY = N*incy; gflops = FLOPS_CHEMV( N ) / 1e9; TESTING_MALLOC( A, magmaFloatComplex, sizeA ); TESTING_MALLOC( X, magmaFloatComplex, sizeX ); TESTING_MALLOC( Y, magmaFloatComplex, sizeY ); TESTING_MALLOC( Ycublas, magmaFloatComplex, sizeY ); TESTING_MALLOC( Ymagma, magmaFloatComplex, sizeY ); TESTING_DEVALLOC( dA, magmaFloatComplex, sizeA ); TESTING_DEVALLOC( dX, magmaFloatComplex, sizeX ); TESTING_DEVALLOC( dY, magmaFloatComplex, sizeY ); blocks = (N + nb - 1) / nb; ldwork = lda * (blocks + 1); TESTING_DEVALLOC( dC_work, magmaFloatComplex, ldwork ); /* Initialize the matrix */ lapackf77_clarnv( &ione, ISEED, &sizeA, A ); magma_cmake_hermitian( N, A, lda ); lapackf77_clarnv( &ione, ISEED, &sizeX, X ); lapackf77_clarnv( &ione, ISEED, &sizeY, Y ); /* ===================================================================== Performs operation using CUBLAS =================================================================== */ magma_csetmatrix( N, N, A, lda, dA, lda ); magma_csetvector( N, X, incx, dX, incx ); magma_csetvector( N, Y, incy, dY, incy ); cublas_time = magma_sync_wtime( 0 ); cublasChemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); cublas_time = magma_sync_wtime( 0 ) - cublas_time; cublas_perf = gflops / cublas_time; magma_cgetvector( N, dY, incy, Ycublas, incy ); /* ===================================================================== Performs operation using MAGMA BLAS =================================================================== */ magma_csetvector( N, Y, incy, dY, incy ); magma_time = magma_sync_wtime( 0 ); #if (GPUSHMEM >= 200) magmablas_chemv2( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy, dC_work, ldwork ); #else magmablas_chemv( opts.uplo, N, alpha, dA, lda, dX, incx, beta, dY, incy ); #endif magma_time = magma_sync_wtime( 0 ) - magma_time; magma_perf = gflops / magma_time; magma_cgetvector( N, dY, incy, Ymagma, incy ); /* ===================================================================== Performs operation using CPU BLAS =================================================================== */ cpu_time = magma_wtime(); blasf77_chemv( &opts.uplo, &N, &alpha, A, &lda, X, &incx, &beta, Y, &incy ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; /* ===================================================================== Check the result =================================================================== */ blasf77_caxpy( &N, &c_neg_one, Y, &incy, Ymagma, &incy ); magma_error = lapackf77_clange( "M", &N, &ione, Ymagma, &N, work ) / N; blasf77_caxpy( &N, &c_neg_one, Y, &incy, Ycublas, &incy ); cublas_error = lapackf77_clange( "M", &N, &ione, Ycublas, &N, work ) / N; printf("%5d %7.2f (%7.2f) %7.2f (%7.2f) %7.2f (%7.2f) %8.2e %8.2e\n", (int) N, magma_perf, 1000.*magma_time, cublas_perf, 1000.*cublas_time, cpu_perf, 1000.*cpu_time, magma_error, cublas_error ); TESTING_FREE( A ); TESTING_FREE( X ); TESTING_FREE( Y ); TESTING_FREE( Ycublas ); TESTING_FREE( Ymagma ); TESTING_DEVFREE( dA ); TESTING_DEVFREE( dX ); TESTING_DEVFREE( dY ); TESTING_DEVFREE( dC_work ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }