Beispiel #1
0
int main(int argc, char **argv)
{        
    TESTING_CUDA_INIT();

    magma_timestr_t  start, end;
    double      flops, magma_perf, cuda_perf, error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    cuDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    FILE        *fp ; 
    magma_int_t i, lda, Xm, Ym;
    magma_int_t M, M0 = 0;
    magma_int_t N, N0 = 0;
    magma_int_t szeA, szeX, szeY;
    magma_int_t istart = 64;
    magma_int_t iend   = 10240;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    char        trans = MagmaNoTrans;
    cuDoubleComplex alpha = MAGMA_Z_MAKE(1., 0.); // MAGMA_Z_MAKE(  1.5, -2.3 );
    cuDoubleComplex beta  = MAGMA_Z_MAKE(0., 0.); // MAGMA_Z_MAKE( -0.6,  0.8 );
    cuDoubleComplex *A, *X, *Y, *Ycublas, *Ymagma;
    cuDoubleComplex *dA, *dX, *dY;
        
    if (argc != 1){
        for(i=1; i<argc; i++){
            if ( strcmp("-n", argv[i]) == 0 ){
                N0 = atoi(argv[++i]);
            }
            else if ( strcmp("-m", argv[i]) == 0 ){
                M0 = atoi(argv[++i]);
            }
            else if (strcmp("-N", argv[i])==0){
                trans = MagmaNoTrans;
            }
            else if (strcmp("-T", argv[i])==0){
                trans = MagmaTrans;
            }
#if defined(PRECISION_z) || defined(PRECISION_c)
            else if (strcmp("-C", argv[i])==0){
                trans = MagmaConjTrans;
            }
#endif
        }
    }

    if ( (M0 != 0) && (N0 != 0) )
        iend = istart + 1;

    M = N = iend;
    if ( M0 != 0 ) M = M0;
    if ( N0 != 0 ) N = N0;

    if( trans == MagmaNoTrans ) {
        Xm = N;
        Ym = M;
    }  else {
        Xm = M;
        Ym = N;
    }

    lda = ((M+31)/32)*32;
    
    szeA = lda*N;
    szeX = incx*Xm;
    szeY = incy*Ym;
      
    TESTING_MALLOC( A, cuDoubleComplex, szeA );
    TESTING_MALLOC( X, cuDoubleComplex, szeX );
    TESTING_MALLOC( Y, cuDoubleComplex, szeY );
    TESTING_MALLOC( Ycublas, cuDoubleComplex, szeY );
    TESTING_MALLOC( Ymagma,  cuDoubleComplex, szeY );

    TESTING_DEVALLOC( dA, cuDoubleComplex, szeA );
    TESTING_DEVALLOC( dX, cuDoubleComplex, szeX );
    TESTING_DEVALLOC( dY, cuDoubleComplex, szeY );

    /* Initialize the matrix */
    lapackf77_zlarnv( &ione, ISEED, &szeA, A );
    lapackf77_zlarnv( &ione, ISEED, &szeX, X );
    lapackf77_zlarnv( &ione, ISEED, &szeY, Y );

    fp = fopen ("results_zgemv.txt", "w") ;
    if( fp == NULL ){ printf("Couldn't open output file\n"); exit(1);}

    printf("\nUsage: \n");
    printf("  testing_zgemv [-N|T|C] [-m %d] [-n %d]\n\n", 1024, 1024);

    printf( "   m    n   CUBLAS,Gflop/s   MAGMABLAS Gflop/s   \"error\"\n" 
            "==============================================================\n");
    fprintf(fp, "   m    n   CUBLAS,Gflop/s   MAGMABLAS Gflop/s   \"error\"\n" 
            "==============================================================\n");
    
    for( i=istart; i < iend; i = (int)((i+1)*1.1) )
    {
        M = N = i;
        if ( M0 != 0 ) M = M0;
        if ( N0 != 0 ) N = N0;

        if( trans == MagmaNoTrans ) {
            Xm = N;
            Ym = M;
        }  else {
            Xm = M;
            Ym = N;
        }
         
        lda = ((M+31)/32)*32;
        flops = FLOPS( (double)M, (double)N ) / 1000000;

        printf(      "%5d %5d ", (int) M, (int) N );
        fprintf( fp, "%5d %5d ", (int) M, (int) N );

        /* =====================================================================
           Performs operation using CUDA-BLAS
           =================================================================== */
        magma_zsetmatrix( M, N, A, lda, dA, lda );
        magma_zsetvector( Xm, X, incx, dX, incx );
        magma_zsetvector( Ym, Y, incy, dY, incy );

        /*
         * Cublas Version
         */
        start = get_current_time();
        cublasZgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
        end = get_current_time();
        
        magma_zgetvector( Ym, dY, incy, Ycublas, incy );
        
        cuda_perf = flops / GetTimerValue(start, end);
        printf(     "%11.2f", cuda_perf );
        fprintf(fp, "%11.2f", cuda_perf );

        /*
         * Magma Version
         */
        magma_zsetvector( Ym, Y, incy, dY, incy );
        
        start = get_current_time();
        magmablas_zgemv( trans, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
        end = get_current_time();
        
        magma_zgetvector( Ym, dY, incx, Ymagma, incx );
        
        magma_perf = flops / GetTimerValue(start, end);
        printf(     "%11.2f", magma_perf );
        fprintf(fp, "%11.2f", magma_perf );

        /* =====================================================================
           Computing the Difference Cublas VS Magma
           =================================================================== */
        
        blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy);
        error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work );

#if 0
        printf(      "\t\t %8.6e", error / (double)Ym );
        fprintf( fp, "\t\t %8.6e", error / (double)Ym );

        /*
         * Blas comparaison
         */
        {
            char *blastrans = MagmaNoTransStr;
            if ( trans == MagmaConjTrans )
                blastrans = MagmaConjTransStr;
            else if ( trans == MagmaTrans )
                blastrans = MagmaTransStr;
            
            blasf77_zcopy( &Ym, Y, &incy, Ycublas, &incy );
            blasf77_zgemv( blastrans, &M, &N, 
                           &alpha, A,       &lda, 
                                   X,       &incx, 
                           &beta,  Ycublas, &incy );
            
            blasf77_zaxpy( &Ym, &c_neg_one, Ymagma, &incy, Ycublas, &incy);
            error = lapackf77_zlange( "M", &Ym, &ione, Ycublas, &Ym, work );
        }
#endif

        printf(      "\t\t %8.6e\n", error / (double)Ym );
        fprintf( fp, "\t\t %8.6e\n", error / (double)Ym );

    }
    
    /* Free Memory */
    TESTING_FREE( A );
    TESTING_FREE( X );
    TESTING_FREE( Y );
    TESTING_FREE( Ycublas );
    TESTING_FREE( Ymagma );

    TESTING_DEVFREE( dA );
    TESTING_DEVFREE( dX );
    TESTING_DEVFREE( dY );

    /* Free device */
    TESTING_CUDA_FINALIZE();
    return EXIT_SUCCESS;
}
Beispiel #2
0
void lanczos(complex double * A, 	// chunk of A
		complex double * evecs, //the eigenvectors
		double * evals,		//evals, real
		int n, 			// full size of A
		int m,			// rows of A for this process
		int myOffset,			// where to begin			
		int subSize,			// the subspace size
		int commSize,			// MPI size
		int commRank){			// MPI rank


	MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
	// args for gemv
	char type = 'N';
	int info,inc=1,dim=n;



#ifdef _USE_GPU
	// check the device
	char hostname[256];
	gethostname(hostname,255);

	struct cudaDeviceProp p;
	cudaGetDeviceProperties(&p,0);
	int support = p.canMapHostMemory;

	if(support == 0){
		fprintf(stderr,"%s does not support mapping host memory\n",hostname);
		MPI_Finalize();
		exit(1);
	}

#endif

	// malloc vectors for use in lanczos
	complex double * alpha	= (complex double*) malloc(sizeof(complex double) * subSize);
	complex double * beta	= (complex double*) malloc(sizeof(complex double) * (subSize-1));
	complex double * r ;

	r = (complex double*) malloc(sizeof(complex double) * n);

	complex double * scratch= (complex double*) malloc(sizeof(complex double) * n);
	complex double * Q	= (complex double*) malloc(sizeof(complex double) * n * subSize);

	for (int i=0; i<m*n; i++)
		Q[i] = 0.0+0.0*_Complex_I;


	// an initial q-vector in first column of Q
	for (int i=0; i<n; i++)
		Q[i] = (1.0+1.0*_Complex_I) / sqrt(2.0f* (double) n);


	//dump_mat("Q",Q);

#ifdef _USE_GPU

	cudaError_t cerror;
	cublasStatus_t status = cublasInit();
	check_cu_error("CUBLAS initialization error on host");

	cuDoubleComplex * d_ortho;
	cuDoubleComplex * d_r;
	cuDoubleComplex * d_A;
	cuDoubleComplex * d_Q;
	cuDoubleComplex * d_beta;
	cuDoubleComplex * d_alpha;
	cuDoubleComplex * d_output;

	// zero copy memory for vector r, for use with MPI
	cerror = cudaHostAlloc((void**) &r,sizeof(cuDoubleComplex)*n,cudaHostAllocMapped);
	check_cu_error("cudaHostAlloc failed for r on host");
	cerror = cudaHostGetDevicePointer(&d_r,r,0);
	check_cu_error("cudaHostGetDevicePointer failed for d_r on host");
	// regular mallocs for everyone else
	cerror = cudaMalloc((void**) &d_ortho, sizeof(cuDoubleComplex));
	check_cu_error("cudaMalloc failed for d_ortho on host");
	cerror = cudaMalloc((void**) &d_alpha, sizeof(cuDoubleComplex) * subSize);
	check_cu_error("cudaMalloc failed for d_alpha on host");
	cerror = cudaMalloc((void**) &d_beta, sizeof(cuDoubleComplex) * (subSize-1));
	check_cu_error("cudaMalloc failed for d_beta on host");

	cerror = cudaMalloc((void**) &d_Q, sizeof(cuDoubleComplex) * subSize*n);
	check_cu_error("cudaMalloc failed for d_Q on host");
	cerror = cudaMalloc((void**) &d_A, sizeof(cuDoubleComplex) * m * n);
	check_cu_error("cudaMalloc failed for d_A on host");
	cerror = cudaMalloc((void**) &d_output, sizeof(cuDoubleComplex) * n);
	check_cu_error("cudaMalloc failed for d_output on host");
	// gpu running configuration
	cublasHandle_t handle;
	cublasCreate(&handle);

	dim3 threads,blocks;
	threads.x 	= _LAN_THREADS;
	blocks.x 	= n / threads.x +1;

	threads.y=1,threads.z=1,blocks.y=1,blocks.z	= 1;

#endif

	// multiplicative factors in gemv
	complex double mula 	= 1.0+0.0*_Complex_I;
	complex double mulb 	= 0.0+0.0*_Complex_I;
	complex double mulc 	= -1.0+0.0*_Complex_I;

	// args for gemv
	//char type = 'N';
	//int m=m,n=n,info;
	//int inc=1,dim=n;


	// init vectors
	zgemv_(&type,&m,&n,&mula,A,&m,Q,&inc,&mulb,&r[myOffset],&inc);


	// need to gather into r
	int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
			(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

	//dump_vec(commRank,"r",r);


#ifdef _DEBUG_LANCZOS
	if (success != MPI_SUCCESS) {

		char error_string[256];
		int length_of_error_string;

		MPI_Error_string(success, error_string, &length_of_error_string);
		fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
		MPI_Finalize();
		exit(1);
	}

#endif
	for (int i=0; i<subSize; i++) alpha[i] 	= 0.0f;
	for (int i=0; i<subSize-1; i++) beta[i] = 0.0f;

	for (int i=0; i<n; i++) alpha[0] 	+= r[i] * conj(Q[i]);
	for (int i=0; i<n; i++) r[i] 		-= alpha[0] * Q[i];
	for (int i=0; i<n; i++) beta[0]		+= conj(r[i]) * r[i];	
	beta[0] = sqrt(beta[0]);

	//test subsequent lanczos vectors
	double ortho;

#ifdef _USE_GPU

	// send to device
	status =cublasSetVector(subSize,sizeof(cuDoubleComplex),alpha,1.0,d_alpha,1.0);
	check_last_cublas_error(status,"cublasSetVector failed for d_alpha on host",hostname,__LINE__);
	status =cublasSetVector(subSize-1,sizeof(cuDoubleComplex),beta,1.0,d_beta,1.0);
	check_cb_error("cublasSetVector failed for d_beta on host");
	status = cublasSetMatrix(m,n,sizeof(cuDoubleComplex),A,m,d_A,m);
	check_cb_error("cublasSetMatrix failed for d_A on host");
	status = cublasSetMatrix(n,subSize,sizeof(cuDoubleComplex),Q,n,d_Q,n);
	check_cb_error("cublasSetMatrix failed for d_Q on host");
#endif


#ifdef _GATHER_SCALAR
	//reduction not currently supported for cuda
	complex double * alpha_temp = (complex double * ) malloc (sizeof(complex double) * commSize);
	complex double * beta_temp = (complex double * ) malloc (sizeof(complex double) * commSize);

#endif
	// main lanczos loops
	for (int i=1; i<subSize; i++){

		MPI_Barrier(MPI_COMM_WORLD);
		ortho = 0.0;

#ifndef _USE_GPU


		// new column to Q, updated q
		
		for (int j=0; j<n; j++) Q[i*n+j] = r[j] / beta[i-1];

		// update r 
		zgemv_(&type,&m,&n,&mula,A,&m,&Q[i*n],&inc,&mulb,&r[myOffset],&inc);

		lanczos_diagnostic_c(r,Q,beta,alpha,n,i);

#ifndef _GATHER_SCALAR
		// need to gather into r
		int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
				(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
		if (success != MPI_SUCCESS) {

			char error_string[256];
			int length_of_error_string;

			MPI_Error_string(success, error_string, &length_of_error_string);
			fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
			MPI_Finalize();
			exit(1);
		}

#endif

#endif
		//
		int ind = (commSize==1) ? i-1 : i;

		// another r update
		for (int j=0; j<n; j++) r[j] 	-= beta[ind] * Q[(i-1)*n+j];


#ifndef _GATHER_SCALAR
		// update alpha
		for (int j=0; j<n; j++) alpha[i]+= r[j] * conj(Q[i*n+j]);

#else
		alpha_temp[commRank]=0.0+0.0*I;
		for (int j=0; j<m; j++) alpha_temp[commRank] +=r[j+myOffset] * conj(Q[i*n+j+myOffset]);
		// need to gather into r
		int success = MPI_Allgather((void*) &alpha_temp[commRank], 1, MPI_LONG_DOUBLE, \
				(void*) alpha_temp, commSize-1, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

		for (int j=0; j<commSize; j++) alpha[i]+=alpha_temp[j];


#endif

		// r update
		for (int j=0; j<n; j++) r[j] 	-= alpha[i] * Q[i*n+j];

		// weak orthogonality test
		for (int j=0; j<n; j++)	ortho 	+= fabs(conj(Q[j]) * Q[i*n+j]);



		//exit(0);
		// re-orthogonalize
		// r -= Q(Q^T * r)
		if ( ortho > _EVECS_NORM){

#ifdef _GATHER_SCALAR
			// need to gather into r
			int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
					(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
			if (success != MPI_SUCCESS) {

				char error_string[256];
				int length_of_error_string;

				MPI_Error_string(success, error_string, &length_of_error_string);
				fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
				MPI_Finalize();
				exit(1);
			}

#endif

#endif

			//if (1){

			char typet = 'C';
			zgemv_(&typet,&n,&subSize,&mula,Q,&dim,r,&inc,&mulb,scratch,&inc);
			zgemv_(&type,&n,&subSize,&mulc,Q,&dim,scratch,&inc,&mula,r,&inc);


		}

		// update beta
		if (i<subSize-1){

#ifndef _GATHER_SCALAR

			for (int j=0; j<n; j++) beta[i]	+= conj(r[j]) * r[j];	

#else

			beta_temp[commRank]=0.0+0.0*I;
			for (int j=0; j<m; j++) beta_temp[commRank] +=conj(r[j+myOffset]) * r[j+myOffset];
			int success = MPI_Allgather((void*) &beta_temp[commRank], 1, MPI_LONG_DOUBLE, \
					(void*) beta_temp, commSize-1, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

			for (int j=0; j<commSize; j++) beta[i]+=beta_temp[j];


#endif
			beta[i] = sqrt(beta[i]);
		}

#else

		//lanczos_diagnostic(blocks,threads,d_r,d_Q,d_beta,d_alpha,n,i);
		cerror = lanczos_first_update(blocks, threads, d_r, d_Q, d_beta, n, i);
		check_cu_error("lanczos_first_update failed on host");

		//exit(0);
		cublasGetError();


		cublasZgemv(handle,CUBLAS_OP_N,m,n,&mula,d_A,m,&d_Q[i*n],1,&mulb,&d_r[myOffset],1);

		status = cublasGetError();
		check_cb_error("cublasZgemv failed on host");

		// need to gather into r
		int success = MPI_Allgather((void*) &d_r[myOffset], m, MPI_LONG_DOUBLE, (void*) d_r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
		if (success != MPI_SUCCESS) {

			char error_string[256];
			int length_of_error_string;

			MPI_Error_string(success, error_string, &length_of_error_string);
			fprintf(stderr,"gpu MPI_Allgather failed in file %s around line %d with code %s\n",__FILE__,__LINE__,error_string);
			MPI_Finalize();
			exit(1);
		}

#endif


		int ind = i; //(commSize==1) ? i-1 : i;
		cerror = lanczos_second_update(blocks, threads, d_r, d_Q, d_beta, n, i, ind);
		check_cu_error("lanczos_second_update failed on host");

		cerror = vector_dot(d_Q,d_r,d_output,&d_alpha[i],1,n,i*n,0,0);
		check_cu_error("vector_dot failed on host");

		cerror = lanczos_third_update(blocks, threads, d_r, d_Q, d_alpha, n, i);
		check_cu_error("lanczos_third_update failed on host");

		if (i<subSize-1){
			cerror = vector_dot(d_r,d_r,d_output,&d_beta[i],1,n,0,0,1);
		}

		check_cu_error("vector_dot failed on host");



		// crude orthogonality test
		//
		cerror = vector_dot(d_Q,d_Q,d_output,d_ortho,1,n,0,i*n,1);
		check_cu_error("vector_dot failed on host");

		//lanczos_diagnostic(blocks,threads,d_r,d_Q,d_beta,d_alpha,n,i);

		cudaMemcpy(&ortho,&d_ortho,sizeof(double), cudaMemcpyDeviceToHost);


		if (fabs(ortho) > _EVECS_NORM){
			//if (0){


			cublasGetError();

			cublasZgemv(handle,CUBLAS_OP_T,n,subSize,&mula,d_Q,dim,d_r,1,&mulb,d_output,1);
			cublasZgemv(handle,CUBLAS_OP_N,n,subSize,&mula,d_Q,dim,d_output,1,&mulb,d_output,1);

			status = cublasGetError();
			check_cb_error("cublasZgemv failed on host");

			cerror = lanczos_fourth_update(blocks, threads, d_r, d_output, n);
			check_cu_error("lanczos_fourth_update failed on host");
		}



#endif
		}

#ifdef _USE_GPU

		if (commRank==0){

			cerror = cudaMemcpy(alpha,d_alpha,sizeof(cuDoubleComplex) * subSize, cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_alpha to host");
			cerror = cudaMemcpy(beta,d_beta,sizeof(cuDoubleComplex) * (subSize-1), cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_beta to host");
			cerror = cudaMemcpy(Q,d_Q,sizeof(cuDoubleComplex) * subSize*n, cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_Q to host");

		}
		cudaFree(d_alpha);
		cudaFree(d_output);
		cudaFree(d_beta);
		cudaFree(d_Q);
		cudaFreeHost(d_r);
		cudaFree(d_A);

#endif


#ifdef _DEBUG_LANCZOS
		if (commRank==0){

			printf("alpha & beta :\n");
			for (int i=0; i<subSize; i++)
				printf("%f+%fi ",creal(alpha[i]),cimag(alpha[i]));
			printf("\n");
			for (int i=0; i<subSize-1; i++)
				printf("%f+%fi ",creal(beta[i]),cimag(beta[i]));
			printf("\n");
		}
#endif
		// calculate spectrum of (now) tridiagonal matrix

		double * alp = (double*) malloc(sizeof(double) * subSize);
		double * bet = (double*) malloc(sizeof(double) * (subSize-1));

		for (int i=0; i<subSize; i++) alp[i] = creal(alpha[i]);
		for (int i=0; i<(subSize-1); i++) bet[i] = creal(beta[i]);

#ifdef _CALC_EVECS

		complex double * evecs_lan = (complex double*) malloc(sizeof(complex double) * subSize * subSize);


		type = 'I';

		zsteqr_(&type,&subSize,alp,bet,evecs_lan,&subSize,(double*) evecs,&info);

		type = 'N';

		for (int i=0; i<subSize; i++)
			zgemv_(&type,&n,&subSize,&mula,Q,&n,&evecs_lan[i*subSize],&inc,&mulb,&evecs[i*n],&inc);

		free(evecs_lan);
#else

		dsterf_(&subSize,alp,bet,&info);
		free(bet);

#endif

		for (int i=0; i<subSize; i++) evals[i] = alp[i];

#ifdef _DEBUG_LANCZOS

		if (commRank==0){
			printf("evals :\n");

			for (int i=0; i<subSize; i++)
				printf("%f ",evals[i]);
			printf("\n");

		}
#endif


		free(alp); 
		free(alpha); 	
		free(beta);
#ifndef _USE_GPU
		free(r);
#endif
		free(Q);
		}
Beispiel #3
0
int main(int argc, char **argv)
{
    TESTING_INIT();

    real_Double_t   gflops, magma_perf, magma_time, dev_perf, dev_time, cpu_perf, cpu_time;
    double          magma_error, dev_error, work[1];
    magma_int_t ione     = 1;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t M, N, Xm, Ym, lda, sizeA, sizeX, sizeY;
    magma_int_t incx = 1;
    magma_int_t incy = 1;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE(  1.5, -2.3 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( -0.6,  0.8 );
    magmaDoubleComplex *A, *X, *Y, *Ydev, *Ymagma;
    magmaDoubleComplex_ptr dA, dX, dY;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("trans = %s\n", lapack_trans_const(opts.transA) );
    #ifdef HAVE_CUBLAS
        printf("    M     N   MAGMA Gflop/s (ms)  %s Gflop/s (ms)   CPU Gflop/s (ms)  MAGMA error  %s error\n",
                g_platform_str, g_platform_str );
    #else
        printf("    M     N   %s Gflop/s (ms)   CPU Gflop/s (ms)  %s error\n",
                g_platform_str, g_platform_str );
    #endif
    printf("===================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            lda    = ((M+31)/32)*32;
            gflops = FLOPS_ZGEMV( M, N ) / 1e9;

            if ( opts.transA == MagmaNoTrans ) {
                Xm = N;
                Ym = M;
            } else {
                Xm = M;
                Ym = N;
            }

            sizeA = lda*N;
            sizeX = incx*Xm;
            sizeY = incy*Ym;
            
            TESTING_MALLOC_CPU( A,       magmaDoubleComplex, sizeA );
            TESTING_MALLOC_CPU( X,       magmaDoubleComplex, sizeX );
            TESTING_MALLOC_CPU( Y,       magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ydev,    magmaDoubleComplex, sizeY );
            TESTING_MALLOC_CPU( Ymagma,  magmaDoubleComplex, sizeY );
            
            TESTING_MALLOC_DEV( dA, magmaDoubleComplex, sizeA );
            TESTING_MALLOC_DEV( dX, magmaDoubleComplex, sizeX );
            TESTING_MALLOC_DEV( dY, magmaDoubleComplex, sizeY );
            
            /* Initialize the matrix */
            lapackf77_zlarnv( &ione, ISEED, &sizeA, A );
            lapackf77_zlarnv( &ione, ISEED, &sizeX, X );
            lapackf77_zlarnv( &ione, ISEED, &sizeY, Y );
            
            /* =====================================================================
               Performs operation using CUBLAS
               =================================================================== */
            magma_zsetmatrix( M, N, A, lda, dA, 0, lda, opts.queue );
            magma_zsetvector( Xm, X, incx, dX, 0, incx, opts.queue );
            magma_zsetvector( Ym, Y, incy, dY, 0, incy, opts.queue );
            
            #ifdef HAVE_CUBLAS
                dev_time = magma_sync_wtime( 0 );
                cublasZgemv( opts.handle, cublas_trans_const(opts.transA),
                             M, N, &alpha, dA, lda, dX, incx, &beta, dY, incy );
                dev_time = magma_sync_wtime( 0 ) - dev_time;
            #else
                dev_time = magma_sync_wtime( opts.queue );
                magma_zgemv( opts.transA, M, N,
                             alpha, dA, 0, lda,
                                    dX, 0, incx,
                             beta,  dY, 0, incy, opts.queue );
                dev_time = magma_sync_wtime( opts.queue ) - dev_time;
            #endif
            dev_perf = gflops / dev_time;
            
            magma_zgetvector( Ym, dY, 0, incy, Ydev, incy, opts.queue );
            
            /* =====================================================================
               Performs operation using MAGMABLAS (currently only with CUDA)
               =================================================================== */
            #ifdef HAVE_CUBLAS
                magma_zsetvector( Ym, Y, incy, dY, incy );
                
                magma_time = magma_sync_wtime( 0 );
                magmablas_zgemv( opts.transA, M, N, alpha, dA, lda, dX, incx, beta, dY, incy );
                magma_time = magma_sync_wtime( 0 ) - magma_time;
                magma_perf = gflops / magma_time;
                
                magma_zgetvector( Ym, dY, incy, Ymagma, incy );
            #endif
            
            /* =====================================================================
               Performs operation using CPU BLAS
               =================================================================== */
            cpu_time = magma_wtime();
            blasf77_zgemv( lapack_trans_const(opts.transA), &M, &N,
                           &alpha, A, &lda,
                                   X, &incx,
                           &beta,  Y, &incy );
            cpu_time = magma_wtime() - cpu_time;
            cpu_perf = gflops / cpu_time;
            
            /* =====================================================================
               Check the result
               =================================================================== */
            double Anorm = lapackf77_zlange( "F", &M, &N, A, &lda, work );
            double Xnorm = lapackf77_zlange( "F", &Xm, &ione, X, &Xm, work );
            
            blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ydev, &incy );
            dev_error = lapackf77_zlange( "F", &Ym, &ione, Ydev, &Ym, work ) / (Anorm * Xnorm);
            
            #ifdef HAVE_CUBLAS
                blasf77_zaxpy( &Ym, &c_neg_one, Y, &incy, Ymagma, &incy );
                magma_error = lapackf77_zlange( "F", &Ym, &ione, Ymagma, &Ym, work ) / (Anorm * Xnorm);
                
                printf("%5d %5d   %7.2f (%7.2f)    %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e     %8.2e   %s\n",
                       (int) M, (int) N,
                       magma_perf,  1000.*magma_time,
                       dev_perf,    1000.*dev_time,
                       cpu_perf,    1000.*cpu_time,
                       magma_error, dev_error,
                       (magma_error < tol && dev_error < tol ? "ok" : "failed"));
                status += ! (magma_error < tol && dev_error < tol);
            #else
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)    %8.2e   %s\n",
                       (int) M, (int) N,
                       dev_perf,    1000.*dev_time,
                       cpu_perf,    1000.*cpu_time,
                       dev_error,
                       (dev_error < tol ? "ok" : "failed"));
                status += ! (dev_error < tol);
            #endif
            
            TESTING_FREE_CPU( A );
            TESTING_FREE_CPU( X );
            TESTING_FREE_CPU( Y );
            TESTING_FREE_CPU( Ydev    );
            TESTING_FREE_CPU( Ymagma  );
            
            TESTING_FREE_DEV( dA );
            TESTING_FREE_DEV( dX );
            TESTING_FREE_DEV( dY );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Beispiel #4
0
int main( int argc, char** argv )
{
    TESTING_INIT();
    
    real_Double_t   gflops, t1, t2;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;
    magma_int_t ione = 1;
    const char trans[] = { 'N', 'C', 'T' };
    const char uplo[]  = { 'L', 'U' };
    const char diag[]  = { 'U', 'N' };
    const char side[]  = { 'L', 'R' };
    
    magmaDoubleComplex  *A,  *B,  *C,   *C2, *LU;
    magmaDoubleComplex *dA, *dB, *dC1, *dC2;
    magmaDoubleComplex alpha = MAGMA_Z_MAKE( 0.5, 0.1 );
    magmaDoubleComplex beta  = MAGMA_Z_MAKE( 0.7, 0.2 );
    double dalpha = 0.6;
    double dbeta  = 0.8;
    double work[1], error, total_error;
    magma_int_t ISEED[4] = {0,0,0,1};
    magma_int_t m, n, k, size, maxn, ld, info;
    magma_int_t *piv;
    magma_err_t err;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    printf( "Compares magma wrapper function to cublas function; all diffs should be exactly 0.\n\n" );
    
    total_error = 0.;
    for( int i = 0; i < opts.ntest; ++i ) {
        m = opts.msize[i];
        n = opts.nsize[i];
        k = opts.ksize[i];
        printf("=========================================================================\n");
        printf( "m=%d, n=%d, k=%d\n", (int) m, (int) n, (int) k );
        
        // allocate matrices
        // over-allocate so they can be any combination of {m,n,k} x {m,n,k}.
        maxn = max( max( m, n ), k );
        ld = maxn;
        size = maxn*maxn;
        err = magma_malloc_cpu( (void**) &piv, maxn*sizeof(magma_int_t) );  assert( err == 0 );
        err = magma_zmalloc_pinned( &A,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &B,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C,  size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &C2, size );  assert( err == 0 );
        err = magma_zmalloc_pinned( &LU, size );  assert( err == 0 );
        err = magma_zmalloc( &dA,  size );        assert( err == 0 );
        err = magma_zmalloc( &dB,  size );        assert( err == 0 );
        err = magma_zmalloc( &dC1, size );        assert( err == 0 );
        err = magma_zmalloc( &dC2, size );        assert( err == 0 );
        
        // initialize matrices
        size = maxn*maxn;
        lapackf77_zlarnv( &ione, ISEED, &size, A  );
        lapackf77_zlarnv( &ione, ISEED, &size, B  );
        lapackf77_zlarnv( &ione, ISEED, &size, C  );
        
        printf( "========== Level 1 BLAS ==========\n" );
        
        // ----- test ZSWAP
        // swap columns 2 and 3 of dA, then copy to C2 and compare with A
        if ( n >= 3 ) {
            magma_zsetmatrix( m, n, A, ld, dA, ld );
            magma_zsetmatrix( m, n, A, ld, dB, ld );
            magma_zswap( m, dA(0,1), 1, dA(0,2), 1 );
            magma_zswap( m, dB(0,1), 1, dB(0,2), 1 );
            
            // check results, storing diff between magma and cuda calls in C2
            cublasZaxpy( ld*n, c_neg_one, dA, 1, dB, 1 );
            magma_zgetmatrix( m, n, dB, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &k, C2, &ld, work );
            total_error += error;
            printf( "zswap             diff %.2g\n", error );
        }
        else {
            printf( "zswap skipped for n < 3\n" );
        }
        
        // ----- test IZAMAX
        // get argmax of column of A
        magma_zsetmatrix( m, k, A, ld, dA, ld );
        error = 0;
        for( int j = 0; j < k; ++j ) {
            magma_int_t i1 = magma_izamax( m, dA(0,j), 1 );
            magma_int_t i2 = cublasIzamax( m, dA(0,j), 1 );
            assert( i1 == i2 );
            error += abs( i1 - i2 );
        }
        total_error += error;
        gflops = (double)m * k / 1e9;
        printf( "izamax            diff %.2g\n", error );
        printf( "\n" );
        
        printf( "========== Level 2 BLAS ==========\n" );
        
        // ----- test ZGEMV
        // c = alpha*A*b + beta*c,  with A m*n; b,c m or n-vectors
        // try no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
            magma_zsetmatrix( m, n, A,  ld, dA,  ld );
            magma_zsetvector( maxn, B, 1, dB,  1 );
            magma_zsetvector( maxn, C, 1, dC1, 1 );
            magma_zsetvector( maxn, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_zgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZgemv( trans[ia], m, n, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            size = (trans[ia] == 'N' ? m : n);
            cublasZaxpy( size, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( size, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &size, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMV( m, n ) / 1e9;
            printf( "zgemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZHEMV
        // c = alpha*A*b + beta*c,  with A m*m symmetric; b,c m-vectors
        // try upper/lower
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA, ld );
            magma_zsetvector( m, B, 1, dB,  1 );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_zhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZhemv( uplo[iu], m, alpha, dA, ld, dB, 1, beta, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMV( m ) / 1e9;
            printf( "zhemv( %c )        diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], error, gflops/t1, gflops/t2 );
        }
        printf( "\n" );
        
        // ----- test ZTRSV
        // solve A*c = c,  with A m*m triangular; c m-vector
        // try upper/lower, no-trans/trans, unit/non-unit diag
        // Factor A into LU to get well-conditioned triangles, else solve yields garbage.
        // Still can give garbage if solves aren't consistent with LU factors,
        // e.g., using unit diag for U, so copy lower triangle to upper triangle.
        // Also used for trsm later.
        lapackf77_zlacpy( "Full", &maxn, &maxn, A, &ld, LU, &ld );
        lapackf77_zgetrf( &maxn, &maxn, LU, &ld, piv, &info );
        for( int j = 0; j < maxn; ++j ) {
            for( int i = 0; i < j; ++i ) {
                *LU(i,j) = *LU(j,i);
            }
        }
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            magma_zsetmatrix( m, m, LU, ld, dA, ld );
            magma_zsetvector( m, C, 1, dC1, 1 );
            magma_zsetvector( m, C, 1, dC2, 1 );
            t1 = magma_sync_wtime( 0 );
            magma_ztrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC1, 1 );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZtrsv( uplo[iu], trans[it], diag[id], m, dA, ld, dC2, 1 );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( m, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetvector( m, dC2, 1, C2, 1 );
            error = lapackf77_zlange( "F", &m, &ione, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( MagmaLeft, m, 1 ) / 1e9;
            printf( "ztrsv( %c, %c, %c )  diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], diag[id], error, gflops/t1, gflops/t2 );
        }}}
        printf( "\n" );
        
        printf( "========== Level 3 BLAS ==========\n" );
        
        // ----- test ZGEMM
        // C = alpha*A*B + beta*C,  with A m*k or k*m; B k*n or n*k; C m*n
        // try combinations of no-trans/trans
        for( int ia = 0; ia < 3; ++ia ) {
        for( int ib = 0; ib < 3; ++ib ) {
            bool nta = (trans[ia] == 'N');
            bool ntb = (trans[ib] == 'N');
            magma_zsetmatrix( (nta ? m : k), (nta ? m : k), A, ld, dA,  ld );
            magma_zsetmatrix( (ntb ? k : n), (ntb ? n : k), B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZgemm( trans[ia], trans[ib], m, n, k, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZGEMM( m, n, k ) / 1e9;
            printf( "zgemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    trans[ia], trans[ib], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHEMM
        // C = alpha*A*B + beta*C  (left)  with A m*m symmetric; B,C m*n; or
        // C = alpha*B*A + beta*C  (right) with A n*n symmetric; B,C m*n
        // try left/right, upper/lower
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
            magma_zsetmatrix( m, m, A, ld, dA,  ld );
            magma_zsetmatrix( m, n, B, ld, dB,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZhemm( side[is], uplo[iu], m, n, alpha, dA, ld, dB, ld, beta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &m, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHEMM( side[is], m, n ) / 1e9;
            printf( "zhemm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    side[is], uplo[iu], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHERK
        // C = alpha*A*A^H + beta*C  (no-trans) with A m*k and C m*m symmetric; or
        // C = alpha*A^H*A + beta*C  (trans)    with A k*m and C m*m symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            magma_zsetmatrix( n, k, A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZherk( uplo[iu], trans[it], n, k, dalpha, dA, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHERK( k, n ) / 1e9;
            printf( "zherk( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZHER2K
        // C = alpha*A*B^H + ^alpha*B*A^H + beta*C  (no-trans) with A,B n*k; C n*n symmetric; or
        // C = alpha*A^H*B + ^alpha*B^H*A + beta*C  (trans)    with A,B k*n; C n*n symmetric
        // try upper/lower, no-trans/trans
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
            bool nt = (trans[it] == 'N');
            magma_zsetmatrix( (nt ? n : k), (nt ? n : k), A, ld, dA,  ld );
            magma_zsetmatrix( n, n, C, ld, dC1, ld );
            magma_zsetmatrix( n, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_zher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZher2k( uplo[iu], trans[it], n, k, alpha, dA, ld, dB, ld, dbeta, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( n, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZHER2K( k, n ) / 1e9;
            printf( "zher2k( %c, %c )    diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}
        printf( "\n" );
        
        // ----- test ZTRMM
        // C = alpha*A*C  (left)  with A m*m triangular; C m*n; or
        // C = alpha*C*A  (right) with A n*n triangular; C m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_zsetmatrix( (left ? m : n), (left ? m : n), A, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ztrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZtrmm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRMM( side[is], m, n ) / 1e9;
            printf( "ztrmm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // ----- test ZTRSM
        // solve A*X = alpha*B  (left)  with A m*m triangular; B m*n; or
        // solve X*A = alpha*B  (right) with A n*n triangular; B m*n
        // try left/right, upper/lower, no-trans/trans, unit/non-unit
        for( int is = 0; is < 2; ++is ) {
        for( int iu = 0; iu < 2; ++iu ) {
        for( int it = 0; it < 3; ++it ) {
        for( int id = 0; id < 2; ++id ) {
            bool left = (side[is] == 'L');
            magma_zsetmatrix( (left ? m : n), (left ? m : n), LU, ld, dA,  ld );
            magma_zsetmatrix( m, n, C, ld, dC1, ld );
            magma_zsetmatrix( m, n, C, ld, dC2, ld );
            t1 = magma_sync_wtime( 0 );
            magma_ztrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC1, ld );
            t1 = magma_sync_wtime( 0 ) - t1;
            t2 = magma_sync_wtime( 0 );
            cublasZtrsm( side[is], uplo[iu], trans[it], diag[id], m, n, alpha, dA, ld, dC2, ld );
            t2 = magma_sync_wtime( 0 ) - t2;
            
            // check results, storing diff between magma and cuda call in C2
            cublasZaxpy( ld*n, c_neg_one, dC1, 1, dC2, 1 );
            magma_zgetmatrix( m, n, dC2, ld, C2, ld );
            error = lapackf77_zlange( "F", &n, &n, C2, &ld, work );
            total_error += error;
            gflops = FLOPS_ZTRSM( side[is], m, n ) / 1e9;
            printf( "ztrsm( %c, %c )     diff %.2g,  Gflop/s %6.2f, %6.2f\n",
                    uplo[iu], trans[it], error, gflops/t1, gflops/t2 );
        }}}}
        printf( "\n" );
        
        // cleanup
        magma_free_cpu( piv );
        magma_free_pinned( A  );
        magma_free_pinned( B  );
        magma_free_pinned( C  );
        magma_free_pinned( C2 );
        magma_free_pinned( LU );
        magma_free( dA  );
        magma_free( dB  );
        magma_free( dC1 );
        magma_free( dC2 );
    }
    
    if ( total_error != 0. ) {
        printf( "total error %.2g -- ought to be 0 -- some test failed (see above).\n",
                total_error );
    }
    else {
        printf( "all tests passed\n" );
    }
    
    TESTING_FINALIZE();
    return 0;
}