Esempio n. 1
0
void init_GPU_data(GlobalParams &p) {
    
    int n_known = p.nRegions;
    int n_total = p.pred_dist.n_rows;
    int n_pred = n_total-n_known;
    
    scatr_magma_init();

    arma::mat dist12 = p.pred_dist(arma::span(0,n_known-1),        arma::span(n_known, n_total-1));
    arma::mat dist22 = p.pred_dist(arma::span(n_known, n_total-1), arma::span(n_known, n_total-1));
    
    checkCublasError( cublasCreate_v2(&p.handle), "handle (Create)" );
    
    cudaMalloc((void**) &p.d_dist12, n_known*n_pred*sizeof(double));  checkCudaError("dist12 (Malloc)");
    checkCublasError( 
        cublasSetMatrix(n_known, n_pred, sizeof(double), dist12.memptr(), n_known, p.d_dist12, n_known), 
        "dist12 (Set)" 
    );
    
    cudaMalloc((void**) &p.d_dist22, n_pred*n_pred * sizeof(double));   checkCudaError("dist22 (Malloc)");
    checkCublasError( 
        cublasSetMatrix(n_pred, n_pred, sizeof(double), dist22.memptr(), n_pred, p.d_dist22, n_pred), 
        "dist22 (Set)"
    );
    
    cudaMalloc((void**) &p.d_cov12,    n_known * n_pred  * sizeof(double)); checkCudaError("cov12 (Malloc)");
    cudaMalloc((void**) &p.d_cov22,    n_pred  * n_pred  * sizeof(double)); checkCudaError("cov22 (Malloc)");
    cudaMalloc((void**) &p.d_invcov11, n_known * n_known * sizeof(double)); checkCudaError("invcov11 (Malloc)");
    cudaMalloc((void**) &p.d_tmp,      n_pred  * n_known * sizeof(double)); checkCudaError("tmp (Malloc)");
}
Esempio n. 2
0
CAMLprim value spoc_cublasSetMatrix (value rows, value cols, value a, value lda, value b, value ldb, value dev){
	CAMLparam5(rows, cols, a, lda, b);
	CAMLxparam2(ldb, dev);
	CAMLlocal4(dev_vec_array, dev_vec, gi, bigArray);
	CUdeviceptr d_B;
	void* h_A;
	int type_size = sizeof(double);
	int tag;
	int id;
	gi = Field(dev, 0);
	id = Int_val(Field(gi, 7));
	GET_VEC(b, d_B);
	GET_HOST_VEC (a, h_A);

	CUBLAS_GET_CONTEXT;
	int custom = 0;
	GET_TYPE_SIZE;

	//printf("rows : %d, col: %d, type_size : %d, lda :%d, ldb : %d\n", Int_val(rows), Int_val(cols), type_size, Int_val (lda), Int_val(ldb));
	//fflush(stdout);
	CUBLAS_CHECK_CALL(cublasSetMatrix(Int_val(rows), Int_val(cols), type_size, h_A, Int_val(lda), (void*) d_B, Int_val(ldb)));
	
	CUBLAS_RESTORE_CONTEXT;
	CAMLreturn(Val_unit);
}
Esempio n. 3
0
void CuBlasMatrix::updateDeviceData()
{
    cublasStatus_t status = cublasSetMatrix(rows, cols, sizeof(cuDoubleComplex), hostRaw, rows, deviceRaw, rows);
    if(status != CUBLAS_STATUS_SUCCESS)
    {
        qFatal("Data upload faild");
    }
    hostDataChanged = false;
}
Esempio n. 4
0
uint micronn_matrix_set_vals(micronn_matrix* w, float* vals)
{
    cublasStatus_t stat;
    stat = cublasSetMatrix(w->rows, w->cols, sizeof(float), vals, w->rows, w->devPtrvals, w->rows);
    if(stat != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "data upload failed\n");
        return 0;
    }
    return 1;
};
Esempio n. 5
0
File: ardblas.c Progetto: rforge/gcb
SEXP d_setMatrix(SEXP m)
{
	int
		rows = nrows(m), cols = ncols(m);
	double * d_m;
	
	cublasAlloc(rows * cols, sizeof(double), (void **)&d_m);
	cublasSetMatrix(rows, cols, sizeof(double), REAL(m), rows, d_m, rows);
	checkCublasError("d_setMatrix");
	
	return packMatrix(rows, cols, d_m);
}
Esempio n. 6
0
void magma_setmatrix(
    magma_int_t m, magma_int_t n, size_t elemSize,
    void const* hA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb )
{
    cublasStatus_t status;
    status = cublasSetMatrix(
        m, n, elemSize,
        hA_src, lda,
        dB_dst, ldb );
    check_error( status );
}
Esempio n. 7
0
void magma_ssetmatrix_internal(
    magma_int_t m, magma_int_t n,
    float const* hA_src, magma_int_t lda,
    float*       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrix(
        m, n, sizeof(float),
        hA_src, lda,
        dB_dst, ldb );
    check_xerror( status, func, file, line );
}
Esempio n. 8
0
void magma_setmatrix_internal(
    magma_int_t m, magma_int_t n, magma_int_t elemSize,
    void const* hA_src, magma_int_t lda,
    void*       dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrix(
        m, n, elemSize,
        hA_src, lda,
        dB_dst, ldb );
    check_xerror( status, func, file, line );
}
Esempio n. 9
0
void cuda_gemm(const cublasHandle_t handle, const cublasOperation_t transa, const cublasOperation_t transb, int m, int n, int k, const T alpha, const T A[], int lda, const T B[], int ldb, const T beta, T C[], int ldc, GEMM gemm)
{
	T *d_A = NULL;
	cudaMalloc((void**)&d_A, m*k*sizeof(T));
	cublasSetMatrix(m, k, sizeof(T), A, m, d_A, m);

	T *d_B = NULL;
	cudaMalloc((void**)&d_B, k*n*sizeof(T));
	cublasSetMatrix(k, n, sizeof(T), B, k, d_B, k);

	T *d_C = NULL;
	cudaMalloc((void**)&d_C, m*n*sizeof(T));
	cublasSetMatrix(m, n, sizeof(T), C, m, d_C, m);

	gemm(handle, transa, transb, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc);

	cublasGetMatrix(m, n, sizeof(T), d_C, m, C, m);

	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
}
Esempio n. 10
0
// ========================================
// copying sub-matrices (contiguous columns)
extern "C" void
magma_zsetmatrix_internal(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex const* hA_src, magma_int_t lda,
    magmaDoubleComplex_ptr    dB_dst, magma_int_t ldb,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetMatrix(
        m, n, sizeof(magmaDoubleComplex),
        hA_src, lda,
        dB_dst, ldb );
    check_xerror( status, func, file, line );
}
Esempio n. 11
0
void mat_prod_mat(const double* a, cublasOperation_t op_a, const double* b, cublasOperation_t op_b, double*c, int m, int n, int k){

	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context

	// on the device
	double* d_a; // d_a - a on the device
	double* d_b; // d_b - b on the device
	double* d_c; // d_c - c on the device
	cudaStat = cudaMalloc((void **)&d_a ,m*k*sizeof(*a)); // device
	// memory alloc for a
	cudaStat = cudaMalloc((void **)&d_b ,k*n*sizeof(*b)); // device
	// memory alloc for b
	cudaStat = cudaMalloc((void **)&d_c ,m*n*sizeof(*c)); // device
	// memory alloc for c
	stat = cublasCreate(&handle); // initialize CUBLAS context
// copy matrices from the host to the device
	stat = cublasSetMatrix (m,k, sizeof(*a) ,a,m,d_a ,m); //a -> d_a
	stat = cublasSetMatrix (k,n, sizeof(*b) ,b,k,d_b ,k); //b -> d_b
	stat = cublasSetMatrix (m,n, sizeof(*c) ,c,m,d_c ,m); //c -> d_c
	double al=1.0;	
	double bet=1.0;
// matrix - matrix multiplication : d_c = al*d_a *d_b + bet *d_c
// d_a -mxk matrix , d_b -kxn matrix , d_c -mxn matrix ;
// al ,bet -scalars
	stat=cublasDgemm(handle,op_a,op_b,m,n,k,&al,d_a,m,d_b,k,&bet,d_c,m);
	
	stat = cublasGetMatrix (m, n, sizeof(*c) ,d_c ,m,c,m); // cp d_c - >c

	cudaFree (d_a ); // free device memory
	cudaFree (d_b ); // free device memory
	cudaFree (d_c ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context

}
Esempio n. 12
0
GPUMatrix::GPUMatrix(const Matrix& hostMatrix) :
    _rows(hostMatrix.rows()),
    _cols(hostMatrix.cols()),
    _data(0)
{
    initDeviceMemory();

    // Copy data to the device.
    cublasStatus_t cublasStat = cublasSetMatrix(
        hostMatrix.rows(), hostMatrix.cols(), 
        sizeof(*hostMatrix._data), 
        hostMatrix._data, hostMatrix.rows(), 
        _data, _rows
    );
    if (cublasStat != CUBLAS_STATUS_SUCCESS) {
        throw std::runtime_error("Data transfer to GPU failed!");
    }
}
Esempio n. 13
0
int double_copyMatrixHost2GPU(PGM_Matriz_Double *host, PGM_Matriz_Double *work, PGM_Matriz_GPU* device){
    if(work->n_colunas == device->max_dim && work->n_linhas == device->max_dim){
        int i,j = 0;
        for(i = 0; i < host->n_colunas; i++){
            for( j = 0; j < host->n_linhas; j++){
                PGM_ELEM_MATRIZ(work,j,i) = PGM_ELEM_MATRIZ(host,j,i);
            }
        }
        for ( ; i < work->n_colunas; i++)
            for ( ;j < work->n_linhas; j++)
                PGM_ELEM_MATRIZ(work,j, i) = 0;

        return cublasSetMatrix(device->max_dim,device->max_dim,sizeof(double),work->valor,work->n_colunas, device->valor, device->max_dim);

    }else{
        return CUBLAS_STATUS_INVALID_VALUE;
    }
}
Esempio n. 14
0
    magma_buffer( size_type h, size_type w, pointer host = 0 )
                : gpu_ptr_(0), height_(h), width_(w)
    {
      void *ptr;
      cudaError_t status = cudaMalloc ( reinterpret_cast<void**>(&ptr)
                                      , h*w*sizeof(value_type)
                                      );

      if(status != cudaSuccess) BOOST_THROW_EXCEPTION( std::bad_alloc() );

      gpu_ptr_ = reinterpret_cast<pointer>(ptr);

      if(host)
      {
        cublasStatus_t e = cublasSetMatrix( h,w,sizeof(value_type)
                                          , host    , h
                                          , gpu_ptr_, h
                                          );
      }
    }
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in,
    double *A, int *ia_in, int *ja_in, int *descA,
    double *dB, int *ib_in, int *jb_in, int *descB )
{
/*
 Copy m by n distributed submatrix  from host to GPU
 */

int m  = *m_in;
int n  = *n_in;

int ia = *ia_in;
int ja = *ja_in;
int ib = *ib_in;
int jb = *jb_in;

int nprow = 0;
int npcol = 0;
int myprow = 0;
int mypcol = 0;
int mmb = 0;
int nnb = 0;

int istart = 0;
int iend = 0;
int isize = 0;

int Locp = 0;
int Locq = 0;
int lld = 0;

int jstart = 0;
int jend = 0;
int jsize = 0;

int iib = 0;
int jjb = 0;

cuDoubleComplex *dBptr = 0;
double *Btmp = 0;
F_CHAR_T NoTrans = "N";

int descBtmp[DLEN_];
int elmSize = sizeof(cuDoubleComplex);

double z_one[2];
double z_zero[2];

/*
 Tuneable parameters
 */
int mfactor = 4;
int nfactor = 4;


int rsrc = 0;
int csrc = 0;
int irsrc = 0;
int jcsrc = 0;
int info = 0;

int notran = 0;
int TrA = 0;

int lrindx = 0;
int lcindx = 0;
int nrow = 0;
int ncol = 0;
int mm = 0;
int nn = 0;
int iia = 0;
int jja = 0;

cublasStatus cu_status;

z_one[REAL_PART] = 1;
z_one[IMAG_PART] = 0;
z_zero[REAL_PART] = 0;
z_zero[IMAG_PART] = 0;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );

notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );


/*
 * check arguments
 */
if ((m <= 0) || (n <= 0)) {
   return;
   };

assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) );
assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) );
assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) );
assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) );

/*
 * Create a temp matrix that is aligned to descB.
 * Assume size is   mmb by nnb
 */

if (notran) {
  mmb = MIN( m, descB[MB_] * nprow * mfactor);
  nnb = MIN( n, descB[NB_] * npcol * nfactor);
  }
else {
  nnb = MIN( m, descB[MB_] * nprow * mfactor);
  mmb = MIN( n, descB[NB_] * npcol * nfactor);
  };
  

mmb = MAX( mmb,1);
nnb = MAX( nnb,1);

rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow);
csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol);


Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow );
Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol );

Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize );
assert( Btmp != 0 );


lld = MAX(1, Locp);
descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_],
            &rsrc, &csrc, &descB[CTXT_], &lld, &info );

assert( info == 0);



for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) {
  jend = MIN( ja + n -1, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  for( istart=ia; istart <= ia + m-1; istart = iend + 1) {
     iend = MIN( ia + m-1, istart + mmb -1);
     isize = iend - istart + 1;

     iia = ia + (istart-1);
     jja = ja + (jstart-1);


     iib = 1;
     jjb = 1;
     

     if (notran) {
        mm = isize;
        nn = jsize;
        }
    else {
        mm = jsize;
        nn = isize;
        };


     pzgeadd_( TRAN,  &mm, &nn,   z_one, A, &iia, &jja, descA,  
                         z_zero, Btmp, &iib, &jjb, descBtmp );

     /* 
      * find local extent
      */

     
     if (notran) {
       iib = ib + (istart-1);
       jjb = jb + (jstart-1);
       }
     else {
       iib = ib + (jstart-1);
       jjb = jb + (istart-1);
       };

     if (notran) {
       nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol);
       }
     else {
       nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol);

     };

     /*
      Perform global
      dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize)

      Perform local
      dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <-
              B(1:nrow, 1:ncol)
      */


     infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol,
               &lrindx, &lcindx,  &irsrc, &jcsrc );


     dBptr = (cuDoubleComplex *) dB;
     dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]);


     cu_status = cublasSetMatrix( nrow,ncol,elmSize,
               (cuDoubleComplex *) Btmp,  descBtmp[LLD_],
                                   dBptr, descB[LLD_] );
     assert( cu_status == CUBLAS_STATUS_SUCCESS );

     };
  };

  free( Btmp );

  return;
}
Esempio n. 16
0
void lanczos(complex double * A, 	// chunk of A
		complex double * evecs, //the eigenvectors
		double * evals,		//evals, real
		int n, 			// full size of A
		int m,			// rows of A for this process
		int myOffset,			// where to begin			
		int subSize,			// the subspace size
		int commSize,			// MPI size
		int commRank){			// MPI rank


	MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
	// args for gemv
	char type = 'N';
	int info,inc=1,dim=n;



#ifdef _USE_GPU
	// check the device
	char hostname[256];
	gethostname(hostname,255);

	struct cudaDeviceProp p;
	cudaGetDeviceProperties(&p,0);
	int support = p.canMapHostMemory;

	if(support == 0){
		fprintf(stderr,"%s does not support mapping host memory\n",hostname);
		MPI_Finalize();
		exit(1);
	}

#endif

	// malloc vectors for use in lanczos
	complex double * alpha	= (complex double*) malloc(sizeof(complex double) * subSize);
	complex double * beta	= (complex double*) malloc(sizeof(complex double) * (subSize-1));
	complex double * r ;

	r = (complex double*) malloc(sizeof(complex double) * n);

	complex double * scratch= (complex double*) malloc(sizeof(complex double) * n);
	complex double * Q	= (complex double*) malloc(sizeof(complex double) * n * subSize);

	for (int i=0; i<m*n; i++)
		Q[i] = 0.0+0.0*_Complex_I;


	// an initial q-vector in first column of Q
	for (int i=0; i<n; i++)
		Q[i] = (1.0+1.0*_Complex_I) / sqrt(2.0f* (double) n);


	//dump_mat("Q",Q);

#ifdef _USE_GPU

	cudaError_t cerror;
	cublasStatus_t status = cublasInit();
	check_cu_error("CUBLAS initialization error on host");

	cuDoubleComplex * d_ortho;
	cuDoubleComplex * d_r;
	cuDoubleComplex * d_A;
	cuDoubleComplex * d_Q;
	cuDoubleComplex * d_beta;
	cuDoubleComplex * d_alpha;
	cuDoubleComplex * d_output;

	// zero copy memory for vector r, for use with MPI
	cerror = cudaHostAlloc((void**) &r,sizeof(cuDoubleComplex)*n,cudaHostAllocMapped);
	check_cu_error("cudaHostAlloc failed for r on host");
	cerror = cudaHostGetDevicePointer(&d_r,r,0);
	check_cu_error("cudaHostGetDevicePointer failed for d_r on host");
	// regular mallocs for everyone else
	cerror = cudaMalloc((void**) &d_ortho, sizeof(cuDoubleComplex));
	check_cu_error("cudaMalloc failed for d_ortho on host");
	cerror = cudaMalloc((void**) &d_alpha, sizeof(cuDoubleComplex) * subSize);
	check_cu_error("cudaMalloc failed for d_alpha on host");
	cerror = cudaMalloc((void**) &d_beta, sizeof(cuDoubleComplex) * (subSize-1));
	check_cu_error("cudaMalloc failed for d_beta on host");

	cerror = cudaMalloc((void**) &d_Q, sizeof(cuDoubleComplex) * subSize*n);
	check_cu_error("cudaMalloc failed for d_Q on host");
	cerror = cudaMalloc((void**) &d_A, sizeof(cuDoubleComplex) * m * n);
	check_cu_error("cudaMalloc failed for d_A on host");
	cerror = cudaMalloc((void**) &d_output, sizeof(cuDoubleComplex) * n);
	check_cu_error("cudaMalloc failed for d_output on host");
	// gpu running configuration
	cublasHandle_t handle;
	cublasCreate(&handle);

	dim3 threads,blocks;
	threads.x 	= _LAN_THREADS;
	blocks.x 	= n / threads.x +1;

	threads.y=1,threads.z=1,blocks.y=1,blocks.z	= 1;

#endif

	// multiplicative factors in gemv
	complex double mula 	= 1.0+0.0*_Complex_I;
	complex double mulb 	= 0.0+0.0*_Complex_I;
	complex double mulc 	= -1.0+0.0*_Complex_I;

	// args for gemv
	//char type = 'N';
	//int m=m,n=n,info;
	//int inc=1,dim=n;


	// init vectors
	zgemv_(&type,&m,&n,&mula,A,&m,Q,&inc,&mulb,&r[myOffset],&inc);


	// need to gather into r
	int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
			(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

	//dump_vec(commRank,"r",r);


#ifdef _DEBUG_LANCZOS
	if (success != MPI_SUCCESS) {

		char error_string[256];
		int length_of_error_string;

		MPI_Error_string(success, error_string, &length_of_error_string);
		fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
		MPI_Finalize();
		exit(1);
	}

#endif
	for (int i=0; i<subSize; i++) alpha[i] 	= 0.0f;
	for (int i=0; i<subSize-1; i++) beta[i] = 0.0f;

	for (int i=0; i<n; i++) alpha[0] 	+= r[i] * conj(Q[i]);
	for (int i=0; i<n; i++) r[i] 		-= alpha[0] * Q[i];
	for (int i=0; i<n; i++) beta[0]		+= conj(r[i]) * r[i];	
	beta[0] = sqrt(beta[0]);

	//test subsequent lanczos vectors
	double ortho;

#ifdef _USE_GPU

	// send to device
	status =cublasSetVector(subSize,sizeof(cuDoubleComplex),alpha,1.0,d_alpha,1.0);
	check_last_cublas_error(status,"cublasSetVector failed for d_alpha on host",hostname,__LINE__);
	status =cublasSetVector(subSize-1,sizeof(cuDoubleComplex),beta,1.0,d_beta,1.0);
	check_cb_error("cublasSetVector failed for d_beta on host");
	status = cublasSetMatrix(m,n,sizeof(cuDoubleComplex),A,m,d_A,m);
	check_cb_error("cublasSetMatrix failed for d_A on host");
	status = cublasSetMatrix(n,subSize,sizeof(cuDoubleComplex),Q,n,d_Q,n);
	check_cb_error("cublasSetMatrix failed for d_Q on host");
#endif


#ifdef _GATHER_SCALAR
	//reduction not currently supported for cuda
	complex double * alpha_temp = (complex double * ) malloc (sizeof(complex double) * commSize);
	complex double * beta_temp = (complex double * ) malloc (sizeof(complex double) * commSize);

#endif
	// main lanczos loops
	for (int i=1; i<subSize; i++){

		MPI_Barrier(MPI_COMM_WORLD);
		ortho = 0.0;

#ifndef _USE_GPU


		// new column to Q, updated q
		
		for (int j=0; j<n; j++) Q[i*n+j] = r[j] / beta[i-1];

		// update r 
		zgemv_(&type,&m,&n,&mula,A,&m,&Q[i*n],&inc,&mulb,&r[myOffset],&inc);

		lanczos_diagnostic_c(r,Q,beta,alpha,n,i);

#ifndef _GATHER_SCALAR
		// need to gather into r
		int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
				(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
		if (success != MPI_SUCCESS) {

			char error_string[256];
			int length_of_error_string;

			MPI_Error_string(success, error_string, &length_of_error_string);
			fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
			MPI_Finalize();
			exit(1);
		}

#endif

#endif
		//
		int ind = (commSize==1) ? i-1 : i;

		// another r update
		for (int j=0; j<n; j++) r[j] 	-= beta[ind] * Q[(i-1)*n+j];


#ifndef _GATHER_SCALAR
		// update alpha
		for (int j=0; j<n; j++) alpha[i]+= r[j] * conj(Q[i*n+j]);

#else
		alpha_temp[commRank]=0.0+0.0*I;
		for (int j=0; j<m; j++) alpha_temp[commRank] +=r[j+myOffset] * conj(Q[i*n+j+myOffset]);
		// need to gather into r
		int success = MPI_Allgather((void*) &alpha_temp[commRank], 1, MPI_LONG_DOUBLE, \
				(void*) alpha_temp, commSize-1, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

		for (int j=0; j<commSize; j++) alpha[i]+=alpha_temp[j];


#endif

		// r update
		for (int j=0; j<n; j++) r[j] 	-= alpha[i] * Q[i*n+j];

		// weak orthogonality test
		for (int j=0; j<n; j++)	ortho 	+= fabs(conj(Q[j]) * Q[i*n+j]);



		//exit(0);
		// re-orthogonalize
		// r -= Q(Q^T * r)
		if ( ortho > _EVECS_NORM){

#ifdef _GATHER_SCALAR
			// need to gather into r
			int success = MPI_Allgather((void*) &r[myOffset], m, MPI_LONG_DOUBLE, \
					(void*) r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
			if (success != MPI_SUCCESS) {

				char error_string[256];
				int length_of_error_string;

				MPI_Error_string(success, error_string, &length_of_error_string);
				fprintf(stderr,"MPI_Allgather failed in file %s around line %d with code : %s\n",__FILE__,__LINE__,error_string);
				MPI_Finalize();
				exit(1);
			}

#endif

#endif

			//if (1){

			char typet = 'C';
			zgemv_(&typet,&n,&subSize,&mula,Q,&dim,r,&inc,&mulb,scratch,&inc);
			zgemv_(&type,&n,&subSize,&mulc,Q,&dim,scratch,&inc,&mula,r,&inc);


		}

		// update beta
		if (i<subSize-1){

#ifndef _GATHER_SCALAR

			for (int j=0; j<n; j++) beta[i]	+= conj(r[j]) * r[j];	

#else

			beta_temp[commRank]=0.0+0.0*I;
			for (int j=0; j<m; j++) beta_temp[commRank] +=conj(r[j+myOffset]) * r[j+myOffset];
			int success = MPI_Allgather((void*) &beta_temp[commRank], 1, MPI_LONG_DOUBLE, \
					(void*) beta_temp, commSize-1, MPI_LONG_DOUBLE,MPI_COMM_WORLD);

			for (int j=0; j<commSize; j++) beta[i]+=beta_temp[j];


#endif
			beta[i] = sqrt(beta[i]);
		}

#else

		//lanczos_diagnostic(blocks,threads,d_r,d_Q,d_beta,d_alpha,n,i);
		cerror = lanczos_first_update(blocks, threads, d_r, d_Q, d_beta, n, i);
		check_cu_error("lanczos_first_update failed on host");

		//exit(0);
		cublasGetError();


		cublasZgemv(handle,CUBLAS_OP_N,m,n,&mula,d_A,m,&d_Q[i*n],1,&mulb,&d_r[myOffset],1);

		status = cublasGetError();
		check_cb_error("cublasZgemv failed on host");

		// need to gather into r
		int success = MPI_Allgather((void*) &d_r[myOffset], m, MPI_LONG_DOUBLE, (void*) d_r, m, MPI_LONG_DOUBLE,MPI_COMM_WORLD);


#ifdef _DEBUG_LANCZOS
		if (success != MPI_SUCCESS) {

			char error_string[256];
			int length_of_error_string;

			MPI_Error_string(success, error_string, &length_of_error_string);
			fprintf(stderr,"gpu MPI_Allgather failed in file %s around line %d with code %s\n",__FILE__,__LINE__,error_string);
			MPI_Finalize();
			exit(1);
		}

#endif


		int ind = i; //(commSize==1) ? i-1 : i;
		cerror = lanczos_second_update(blocks, threads, d_r, d_Q, d_beta, n, i, ind);
		check_cu_error("lanczos_second_update failed on host");

		cerror = vector_dot(d_Q,d_r,d_output,&d_alpha[i],1,n,i*n,0,0);
		check_cu_error("vector_dot failed on host");

		cerror = lanczos_third_update(blocks, threads, d_r, d_Q, d_alpha, n, i);
		check_cu_error("lanczos_third_update failed on host");

		if (i<subSize-1){
			cerror = vector_dot(d_r,d_r,d_output,&d_beta[i],1,n,0,0,1);
		}

		check_cu_error("vector_dot failed on host");



		// crude orthogonality test
		//
		cerror = vector_dot(d_Q,d_Q,d_output,d_ortho,1,n,0,i*n,1);
		check_cu_error("vector_dot failed on host");

		//lanczos_diagnostic(blocks,threads,d_r,d_Q,d_beta,d_alpha,n,i);

		cudaMemcpy(&ortho,&d_ortho,sizeof(double), cudaMemcpyDeviceToHost);


		if (fabs(ortho) > _EVECS_NORM){
			//if (0){


			cublasGetError();

			cublasZgemv(handle,CUBLAS_OP_T,n,subSize,&mula,d_Q,dim,d_r,1,&mulb,d_output,1);
			cublasZgemv(handle,CUBLAS_OP_N,n,subSize,&mula,d_Q,dim,d_output,1,&mulb,d_output,1);

			status = cublasGetError();
			check_cb_error("cublasZgemv failed on host");

			cerror = lanczos_fourth_update(blocks, threads, d_r, d_output, n);
			check_cu_error("lanczos_fourth_update failed on host");
		}



#endif
		}

#ifdef _USE_GPU

		if (commRank==0){

			cerror = cudaMemcpy(alpha,d_alpha,sizeof(cuDoubleComplex) * subSize, cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_alpha to host");
			cerror = cudaMemcpy(beta,d_beta,sizeof(cuDoubleComplex) * (subSize-1), cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_beta to host");
			cerror = cudaMemcpy(Q,d_Q,sizeof(cuDoubleComplex) * subSize*n, cudaMemcpyDeviceToHost);
			check_cu_error("cudaMemcpy of d_Q to host");

		}
		cudaFree(d_alpha);
		cudaFree(d_output);
		cudaFree(d_beta);
		cudaFree(d_Q);
		cudaFreeHost(d_r);
		cudaFree(d_A);

#endif


#ifdef _DEBUG_LANCZOS
		if (commRank==0){

			printf("alpha & beta :\n");
			for (int i=0; i<subSize; i++)
				printf("%f+%fi ",creal(alpha[i]),cimag(alpha[i]));
			printf("\n");
			for (int i=0; i<subSize-1; i++)
				printf("%f+%fi ",creal(beta[i]),cimag(beta[i]));
			printf("\n");
		}
#endif
		// calculate spectrum of (now) tridiagonal matrix

		double * alp = (double*) malloc(sizeof(double) * subSize);
		double * bet = (double*) malloc(sizeof(double) * (subSize-1));

		for (int i=0; i<subSize; i++) alp[i] = creal(alpha[i]);
		for (int i=0; i<(subSize-1); i++) bet[i] = creal(beta[i]);

#ifdef _CALC_EVECS

		complex double * evecs_lan = (complex double*) malloc(sizeof(complex double) * subSize * subSize);


		type = 'I';

		zsteqr_(&type,&subSize,alp,bet,evecs_lan,&subSize,(double*) evecs,&info);

		type = 'N';

		for (int i=0; i<subSize; i++)
			zgemv_(&type,&n,&subSize,&mula,Q,&n,&evecs_lan[i*subSize],&inc,&mulb,&evecs[i*n],&inc);

		free(evecs_lan);
#else

		dsterf_(&subSize,alp,bet,&info);
		free(bet);

#endif

		for (int i=0; i<subSize; i++) evals[i] = alp[i];

#ifdef _DEBUG_LANCZOS

		if (commRank==0){
			printf("evals :\n");

			for (int i=0; i<subSize; i++)
				printf("%f ",evals[i]);
			printf("\n");

		}
#endif


		free(alp); 
		free(alpha); 	
		free(beta);
#ifndef _USE_GPU
		free(r);
#endif
		free(Q);
		}
int main(){

	int m = 10;
	int n = m;
	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context
	int i,j; // i-row index , j-col. index
	double * a; // mxm matrix a on the host
	double * b; // mxm matrix b on the host
	double * c; // mxm matrix c on the host
	a=( double *) malloc (m*m* sizeof ( double )); // host memory for a

	b=( double *) malloc (m*m* sizeof ( double )); // host memory for b
	c=( double *) malloc (m*m* sizeof ( double )); // host memory for b

	int ind =1; // a:
	for(j=0;j<m;j ++){ // 11
		for(i=0;i<m;i ++){ // 12 ,17
	//		if(i >=j){ // 13 ,18 ,22
				a[ IDX2C(i,j,m)]=( float )ind ++; // 14 ,19 ,23 ,26
	//		} // 15 ,20 ,24 ,27 ,29
		} // 16 ,21 ,25 ,28 ,30 ,31
	}
	 printf (" lower triangle of a:\n");
	for (i=0;i<m;i ++){
		for (j=0;j<m;j ++){
	//		if(i >=j)
				printf (" %5.0f",a[ IDX2C(i,j,m)]);
		}
		printf ("\n");
	} 

	ind =11; // b:
	for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35
		for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36
			if(i == j)
			b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37
			else 
			b[ IDX2C(i, j, m)] = 2.0;
			//ind ++; // 14 ,20 ,26 ,32 ,38
		} // 15 ,21 ,27 ,33 ,39
	} // 16 ,22 ,28 ,34 ,40
	 printf ("b:\n");
	for (i=0;i<m;i ++){
		for (j=0;j<n;j ++){
			printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row
		}
		printf ("\n");
	} 

	double * d_a; // d_a - a on the device
	double * d_b; // d_b - b on the device
	double * d_c; // d_c - c on the devicde
	cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device memory alloc for a
	cudaStat = cudaMalloc (( void **)& d_b ,m*m* sizeof (*b)); // device memory alloc for b
	cudaStat = cudaMalloc (( void **)& d_c ,m*m* sizeof (*c)); // device memory alloc for c

	stat = cublasCreate (& handle ); // initialize CUBLAS context

	stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a
	stat = cublasSetMatrix (m,m, sizeof (*b) ,b,m,d_b ,m); //b -> d_b

	double startime = CycleTimer::currentSeconds();
	gpu_blas_mmul(d_a, d_b, d_c, m, m, m);
	double endtime = CycleTimer::currentSeconds();
	
	stat = cublasGetMatrix (m,n, sizeof (*c) ,d_c ,m,c,m); // d_b -> b
	 printf (" solution x from Strsm :\n");
	for(i=0;i<m;i ++){
		for(j=0;j<n;j ++){
			printf (" %11.5f",c[IDX2C(i,j,m )]); // print b after Strsm
		}
		printf ("\n");
	} 
	cudaFree (d_a ); // free device memory
	cudaFree (d_b ); // free device memory
	cudaFree (d_c ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context
	free (a); // free host memory
	free (b); // free host memory
	free (c); // free host memory

	printf("Time taken: %lf\n", endtime - startime);
	return EXIT_SUCCESS ;
}
Esempio n. 18
0
int main( int argc, char **argv )
{
	double *A, *B, *C;
	double *cu_A, *cu_B, *cu_C;

	cudaError_t    cuError;
	cublasStatus_t cuStatus;
	cublasHandle_t cuHandle;

	// seed rand()
	srand(time(NULL));

	// allocate memory on CPU
	A = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_P);
	B = (double*)malloc(sizeof(double)*MATRIX_P*MATRIX_M);
	C = (double*)malloc(sizeof(double)*MATRIX_N*MATRIX_M);

	if( !A || !B || !C )
	{
		perror("Can't allocate CPU matrices");
		exit(EXIT_FAILURE);
	}

	// generate matrices
	for( int i = 0; i < MATRIX_N*MATRIX_P; i++ )
		A[i] = 10.0*((double)rand())/RAND_MAX;

	for( int i = 0; i < MATRIX_P*MATRIX_M; i++ )
		B[i] = 10.0*((double)rand())/RAND_MAX;

	// allocate memory on GPU
	cuError = cudaMalloc( &cu_A, sizeof(double)*MATRIX_N*MATRIX_P );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	cuError = cudaMalloc( &cu_B, sizeof(double)*MATRIX_P*MATRIX_M );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	cuError = cudaMalloc( &cu_C, sizeof(double)*MATRIX_N*MATRIX_M );

	if( cuError != cudaSuccess )
	{
		fprintf(stderr, "Can't allocate GPU matrices\n");
		exit(EXIT_FAILURE);
	}

	// setup cuBlas
	cuStatus = cublasCreate( &cuHandle );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error initializing cuBlas\n");
		exit(EXIT_FAILURE);
	}

	// setup matrices
	cuStatus = cublasSetMatrix( MATRIX_N, MATRIX_P, sizeof(double), A, MATRIX_N, cu_A, MATRIX_N );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring matrix A\n");
		exit(EXIT_FAILURE);
	}

	cuStatus = cublasSetMatrix( MATRIX_P, MATRIX_M, sizeof(double), B, MATRIX_P, cu_B, MATRIX_P );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring matrix B\n");
		exit(EXIT_FAILURE);
	}

	// multiply
	double one  = 1.0;
	double zero = 0.0;
	cuStatus = cublasDgemm( cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, MATRIX_N, MATRIX_M, MATRIX_P, &one, cu_A, MATRIX_N, cu_B, MATRIX_P, &zero, cu_C, MATRIX_N );

	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error executing matrix mult\n");
		exit(EXIT_FAILURE);
	}

	// get results
	cuStatus = cublasGetMatrix( MATRIX_N, MATRIX_M, sizeof(double), cu_C, MATRIX_N, C, MATRIX_N );
	if( cuStatus != CUBLAS_STATUS_SUCCESS )
	{
		fprintf(stderr, "Error transferring results\n");
		exit(EXIT_FAILURE);
	}
	
	// check results
	bool good = true;
	for( int i = 0; i < MATRIX_N; i++ )
	{
		for( int j = 0; j < MATRIX_M; j++ )
		{
			double sum = 0.0;
			for( int k = 0; k < MATRIX_P; k++ )
			{
				sum += A[IDX2C(i, k, MATRIX_N)]*B[IDX2C(k, j, MATRIX_P)];
			}
			// check
			if( fabs(sum - C[IDX2C(i,j,MATRIX_N)]) > 0.00001 )
			{
				good = false;
				printf("(%i, %i) sum = %f\tcu_C = %f\tMISMATCH\n", i, j, sum, C[IDX2C(i,j,MATRIX_N)]);
			}
		}
	}

	if( good )
		printf("Results Match\n");
	else
		printf("Results DO NOT Match\n");

	// cleanup
	free( A ); free( B ); free( C );
	cudaFree( cu_A ); cudaFree( cu_B ); cudaFree( cu_C );
	cublasDestroy( cuHandle );

	return 0;
}
void Cpsgecopy_general_async(int m, int n, 
        void *A, int ia, int ja, int *descA,
        void *B, int ib, int jb, int *descB, int is_device_to_host)
{
#define dA(i,j)  (((float*)A) + IDX2F(i,j,descA[LLD_]))
#define dT(i,j) (((float *)T) + IDX2F(i,j,descT[LLD_]))

#define dB(i,j)  (((float *)B) + IDX2F(i,j,descB[LLD_]))
/*
  perform    copy

  B( ib:(ib+m-1), jb:(jb+n-1)) <-  A( ia:(ia+m-1),ja:(ja+n-1))

 */

const int use_MallocHost = FALSE;
const int use_igsum2d = FALSE;

cublasStatus cu_status;

cudaError_t cuda_status;

char notrans[] = "NoTrans";

int descT[DLEN_];

int ldA,ldB,ldT;

int is_same_context, is_same_mb, is_same_nb;
int is_same_p, is_same_q;
int is_same_offset;
int is_same_Locp, is_same_Locq;
int is_aligned;

int lrA1,lcA1, lrA2,lcA2;
int lrT1,lcT1, lrT2,lcT2;
int lrB1,lcB1, lrB2,lcB2;
int rsrc,csrc;
int rsrcA1,csrcA1,  rsrcA2, csrcA2;
int rsrcB1,csrcB1,  rsrcB2, csrcB2;
int iia,jja, iib,jjb;
int icontxt, nprow,npcol, myprow,mypcol;
int LocpA,LocqA,  LocpB,LocqB, LocpT,LocqT;
int mm,nn, lmm,lnn;
size_t nbytes;

float one_[REAL_PART+IMAG_PART+1];
float *one = &(one_[0]);

float zero_[REAL_PART+IMAG_PART+1];
float *zero = &(zero_[0]);


float alpha_[REAL_PART+IMAG_PART+1];
float *alpha = &(alpha_[0]);

float beta_[REAL_PART+IMAG_PART+1];
float *beta = &(beta_[0]);

int isize, isizeT;
float *T = 0;

int elemSize = sizeof(float);
int nnb, jstart,jend,jsize;
int is_ok;

int nmax;
const int bufsize =  1024*1024;
const int use_simple = FALSE;;

one[REAL_PART] = 1.0;
one[IMAG_PART] = 0.0;
zero[REAL_PART] = 0.0;
zero[IMAG_PART] = 0.0;

 if ((m <= 0) || (n <= 0)) {
   return;
 };

 T = 0; 

 ldA = descA[LLD_];
 ldB = descB[LLD_];

  icontxt = descA[CTXT_];
  Cblacs_gridinfo( icontxt, &nprow,&npcol, &myprow, &mypcol);
  assert( nprow >= 1);
  assert( npcol >= 1);
  assert( (0 <= myprow) && (myprow < nprow));
  assert( (0 <= mypcol) && (mypcol < npcol));

  is_ok = (1 <= ia) && (ia + m-1 <= descA[M_]);
  if (!is_ok) {
    printf("Cpsgecopy (%d,%d) :ia %d m %d descA[M_] %d  \n",
            myprow,mypcol,     ia,   m,   descA[M_] );
    printf("Cpsgecopy (%d,%d) :ja %d n %d descA[N_] %d \n",
            myprow,mypcol,     ja,   n,   descA[N_] );
    printf("Cpsgecopy (%d,%d) :ib %d jb %d descB[M_] %d descB[N_] %d\n",
            myprow,mypcol,     ib,   jb,   descB[M_],   descB[N_] );
  };
  assert( (1 <= ia) && (ia + m-1 <= descA[M_]));
  assert( (1 <= ja) && (ja + n-1 <= descA[N_]));
  assert( (1 <= ib) && (ib + m-1 <= descB[M_]));
  assert( (1 <= jb) && (jb + n-1 <= descB[N_]));


  is_same_context = (descA[CTXT_] == descB[CTXT_]);
  is_same_mb = (descA[MB_] == descB[MB_]);
  is_same_nb = (descA[NB_] == descB[NB_]);

  is_same_p = (Cindxg2p(ia,descA[MB_], myprow, descA[RSRC_],nprow) ==
               Cindxg2p(ib,descB[MB_], myprow, descB[RSRC_],nprow) );

  is_same_q = (Cindxg2p(ja,descA[NB_], mypcol, descA[CSRC_],npcol) ==
               Cindxg2p(jb,descB[NB_], mypcol, descB[CSRC_],npcol) );

  is_same_offset = (MOD(ia,descA[MB_]) == MOD(ib,descB[MB_])) &&
                   (MOD(ja,descA[NB_]) == MOD(jb,descB[NB_]));


  local_extent( m,n, ia,ja,descA, &LocpA,&LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );


  local_extent( m,n, ib,jb,descB, &LocpB,&LocqB,&lrB1,&lcB1, &lrB2,&lcB2 );



  /*
  if ((LocpA >= 1) || (LocpB >= 1)) {
     is_same_Locp = (LocpA == LocpB);
  };
  if ((LocqA >= 1) || (LocqB >= 1)) {
     is_same_Locq = (LocqA == LocqB);
  };
  */

  is_same_Locq = (LocqA == LocqB);

  is_same_Locp = (LocpA == LocpB);

  is_aligned = is_same_context &&
               is_same_mb && is_same_nb &&
               is_same_p && is_same_q &&
               is_same_offset &&
               is_same_Locp && is_same_Locq;

  assert( is_same_q );

  assert( is_same_p );

  assert( is_same_offset );

  assert( is_same_Locp );

  assert( is_same_Locq );

  assert( is_aligned );


        
       /*
        no communication required
        copy from device to host
        */

       ldA = descA[LLD_];
       ldB = descB[LLD_];

       mm = LocpA;
       nn = LocqA;

       if (is_device_to_host) {
         /* 
          * transfer from device to host
          */
         if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;
             istatus = cublasGetMatrixAsync(mm, nn, elemSize, 
                 (void *) dA(lrA1,lcA1), ldA, (void *) dB(lrB1,lcB1), ldB,
                 cublas_get_stream() );
             assert( istatus == CUBLAS_STATUS_SUCCESS );
           }
#else
           cu_status = cublasGetMatrix(mm,nn, elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
           };
         }
       else {
         /* 
          * transfer from host to device
          */
         if ( (mm >= 1) && (nn >= 1) ) {
#ifdef USE_CUBLASV2
           {
             cublasStatus_t istatus;

             istatus = cublasSetMatrixAsync(mm,nn,elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB,
               cublas_get_stream()   );

             assert( istatus == CUBLAS_STATUS_SUCCESS );

           }
#else
            cu_status = cublasSetMatrix(mm,nn,elemSize,
               (void *) dA(lrA1,lcA1), ldA,  (void *) dB(lrB1,lcB1),ldB );
            CHKERR(cu_status);
#endif
           };
         };
               



  return;
}
void Cpcswap_gpu( int n, cuComplex *A, int ia,int ja,int *descA, int incA,
                    cuComplex *B, int ib,int jb,int *descB, int incB )
{
/*
 perform pcswap operation when
 both distributed arrays A and B are in device memory
 */



/*
 * allocate temporary space on host
 * then use pcswap for communication
 */

const int use_MallocHost = FALSE;

cublasStatus cu_status;
size_t nbytes;
int elemSize = sizeof( cuComplex );

float *Atmp = 0;
float *Btmp = 0;

int descAtmp[DLEN_];
int descBtmp[DLEN_];
int ldA, ldB, ldAtmp, ldBtmp;

int nprow,npcol,myprow,mypcol;
int Locp, Locq, lrindx, lcindx, mm,nn;
int LocpA, LocqA, lrindxA, lcindxA;
int LocpB, LocqB, lrindxB, lcindxB;
int isizeA, isizeB, rsrc, csrc;

int iia,jja, iib, jjb;
int incAtmp, incBtmp;
int lrA1,lcA1, lrA2,lcA2;
int lrB1,lcB1, lrB2,lcB2;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );


/*
 * allocate storage for vector from A
 */

if (incA == 1) {
   /*
    *  This is a column vector
    */
   mm = n; nn = 1;
   }
else {
  /*
   * This is a row vector
   */
   mm = 1; nn = n;
   };
setup_desc(  mm,nn, ia,ja, descA,   &isizeA, descAtmp );

nbytes = elemSize;
nbytes *= isizeA;
if (use_MallocHost) {
  Atmp = (float *) MallocHost( nbytes );
  }
else {
  Atmp = (float *) malloc( nbytes );
  };
assert( Atmp != 0 );


/*
 * copy vector from A
 */

PROFSTART("swap:GetMatrix");

local_extent( mm,nn,ia,ja,descA,  &LocpA, &LocqA, &lrA1,&lcA1, &lrA2,&lcA2 );

lrindxA = lrA1;
lcindxA = lcA1;

ldA = descA[LLD_];
ldAtmp = descAtmp[LLD_];
if ( (LocpA >= 1) && (LocqA >= 1)) {
  /*
   * copy from GPU device to host CPU
   */
  cu_status = cublasGetMatrix( LocpA,LocqA, elemSize,
             dA(lrindxA,lcindxA), ldA,  Atmp, ldAtmp );

  CHKERR(cu_status);
  };



/*
 * allocate storage for vector from B
 */

Cblacs_gridinfo( descB[CTXT_], &nprow, &npcol, &myprow, &mypcol );

if (incB == 1) {
   /*
    *  This is a column vector
    */
   mm = n; nn = 1;
   }
else {
  /*
   * This is a row vector
   */
   mm = 1; nn = n;
   };
setup_desc( mm,nn, ib,jb,descB, &isizeB, descBtmp );

ldBtmp = descBtmp[LLD_];
ldB = descB[LLD_];

nbytes = elemSize;
nbytes *= isizeB;
if (use_MallocHost) {
  Btmp = (float *) MallocHost( nbytes );
  }
else {
  Btmp = (float *) malloc( nbytes );
  };
assert( Btmp != 0 );



/*
 * copy vector from B
 */

local_extent( mm,nn,ib,jb,descB,  &LocpB, &LocqB, &lrB1,&lcB1,  &lrB2,&lcB2 );

lrindxB = lrB1;
lcindxB = lcB1;



ldB = descB[LLD_];
ldBtmp = descBtmp[LLD_];
if ((LocpB >= 1) && (LocqB >= 1)) {
  /*
   * Copy from GPU to CPU host
   */
  cu_status = cublasGetMatrix(LocpB,LocqB,elemSize,
         dB(lrindxB,lcindxB), ldB, Btmp, ldBtmp );
  CHKERR(cu_status );
  };

PROFEND("swap:GetMatrix");

iia = 1; jja = 1;
iib = 1; jjb = 1;
if (incA == 1) {
   incAtmp = 1;
   }
else {
  incAtmp = descAtmp[M_];
};

if (incB == 1) {
   incBtmp = 1;
    }
else {
   incBtmp = descBtmp[M_];
};


PROFSTART("swap:pcswap");
scalapack_pcswap( &n, Atmp, &iia, &jja, descAtmp, &incAtmp,
                      Btmp, &iib, &jjb, descBtmp, &incBtmp );
PROFEND("swap:pcswap");


/*
 * copy from host CPU back to GPU
 */

PROFSTART("swap:SetMatrix");

if ((LocpA >= 1) && (LocqA >= 1)) {
  /*
   * Copy from CPU host to GPU device
   */
  cu_status = cublasSetMatrix( LocpA, LocqA, elemSize,
              Atmp, ldAtmp, dA(lrindxA,lcindxA), ldA );
  CHKERR(cu_status);
  };


if ((LocpB >= 1) && (LocqB >= 1)) {
  /*
   * Copy from CPU host to GPU device
   */
  cu_status = cublasSetMatrix( LocpB, LocqB, elemSize,
                 Btmp, ldBtmp, dB(lrindxB,lcindxB), ldB );
  CHKERR(cu_status);
  };

PROFEND("swap:SetMatrix");

/*
 * clean up
 */


if (Atmp != 0) {
  if (use_MallocHost) {
    FreeHost(Atmp);
    }
  else {
    free(Atmp); 
    };
  Atmp = 0;
  };

if (Btmp != 0) {
  if (use_MallocHost) {
    FreeHost(Btmp);
     }
  else {
    free(Btmp); 
  };
  Btmp = 0;
 };



return;


}
Esempio n. 21
0
int main ( void ){
	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context
	int i,j; // i-row index , j-col. index
	double * a; // mxm matrix a on the host
	double * b; // mxn matrix b on the host
	a=( double *) malloc (m*m* sizeof ( double )); // host memory for a

	b=( double *) malloc (m*n* sizeof ( double )); // host memory for b

	int ind =11; // a:
	for(j=0;j<m;j ++){ // 11
		for(i=0;i<m;i ++){ // 12 ,17
			if(i >=j){ // 13 ,18 ,22
				a[ IDX2C(i,j,m)]=( double )ind ++; // 14 ,19 ,23 ,26
			} // 15 ,20 ,24 ,27 ,29
		} // 16 ,21 ,25 ,28 ,30 ,31
	}
	 printf (" lower triangle of a:\n");
/*	for (i=0;i<m;i ++){
		for (j=0;j<m;j ++){
			if(i >=j)
				printf (" %5.0f",a[ IDX2C(i,j,m)]);
		}
		printf ("\n");
	} */

	ind =11; // b:
	for(j=0;j<n;j ++){ // 11 ,17 ,23 ,29 ,35
		for(i=0;i<m;i ++){ // 12 ,18 ,24 ,30 ,36
			if(i == j) b[IDX2C(i,i,m)] =  1.0;
			else b[IDX2C(i,j,m)] = 0.0;
		}
	}
		
		//	b[ IDX2C(i,j,m)] = ind++;
			/*if(i == j)
			b[ IDX2C(i,j,m)] = 1.0; // 13 ,19 ,25 ,31 ,37
			else 
			b[ IDX2C(i, j, m)] = 0.0;*/
			//ind ++; // 14 ,20 ,26 ,32 ,38
	 printf ("b:\n");
/*	for (i=0;i<m;i ++){
		for (j=0;j<n;j ++){
			printf (" %5.0f",b[IDX2C(i,j,m)]); // print b row by row
		}
		printf ("\n");
	} */

	double * d_a; // d_a - a on the device
	double * d_b; // d_b - b on the device
	cudaStat = cudaMalloc (( void **)& d_a ,m*m* sizeof (*a)); // device
	// memory alloc for a
	cudaStat = cudaMalloc (( void **)& d_b ,m*n* sizeof (*b)); // device
	// // memory alloc for b
	stat = cublasCreate (& handle ); // initialize CUBLAS context

	stat = cublasSetMatrix (m,m, sizeof (*a) ,a,m,d_a ,m); //a -> d_a
	stat = cublasSetMatrix (m,n, sizeof (*b) ,b,m,d_b ,m); //b -> d_b
	double al =1.0f;

	double startime = CycleTimer::currentSeconds();
	(cublasDtrsm(handle,CUBLAS_SIDE_LEFT,CUBLAS_FILL_MODE_LOWER,
			CUBLAS_OP_N,CUBLAS_DIAG_NON_UNIT,m,n,&al,d_a,m,d_b,m));
	stat = cublasGetMatrix (m,n, sizeof (*b) ,d_b ,m,b,m); // d_b -> b
	double endtime = CycleTimer::currentSeconds();
	printf (" solution x from Strsm :\n");
/*	for(i=0;i<m;i ++){
		for(j=0;j<n;j ++){
			printf (" %11.5f",b[IDX2C(i,j,m )]); // print b after Strsm
		}
		printf ("\n");
	} */
	cudaFree (d_a ); // free device memory
	cudaFree (d_b ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context
	free (a); // free host memory
	free (b); // free host memory

	printf("Time taken: %lf\n", endtime - startime);
	return EXIT_SUCCESS ;
}
int main(int argc, char* argv[])
{
    int i,j,k,index;

    // Linear dimension of matrices
    int dim = 100;

    // Number of A,B,C matrix sets
    int batch_count = 1000;

    // Allocate host storage for batch_count A,B,C square matrices
    double **A, **B, **C;
    A = (double**)malloc(batch_count*sizeof(double*));
    B = (double**)malloc(batch_count*sizeof(double*));
    C = (double**)malloc(batch_count*sizeof(double*));
    for(i=0; i<batch_count; i++) {
        A[i] = (double*)malloc(dim*dim*sizeof(double));
        B[i] = (double*)malloc(dim*dim*sizeof(double));
        C[i] = (double*)malloc(dim*dim*sizeof(double));
    }

    // Create host pointer array to device matrix storage
    double **d_A, **d_B, **d_C, **h_d_A, **h_d_B, **h_d_C;
    h_d_A = (double**)malloc(batch_count*sizeof(double*));
    h_d_B = (double**)malloc(batch_count*sizeof(double*));
    h_d_C = (double**)malloc(batch_count*sizeof(double*));

    for(i=0; i<batch_count; i++) {
        cudaMalloc((void**)&h_d_A[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_B[i], dim*dim*sizeof(double));
        cudaMalloc((void**)&h_d_C[i], dim*dim*sizeof(double));
    }
    // Copy the host array of device pointers to the device
    cudaMalloc((void**)&d_A, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_B, batch_count*sizeof(double*));
    cudaMalloc((void**)&d_C, batch_count*sizeof(double*));
    cudaMemcpy(d_A, h_d_A, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_d_B, batch_count*sizeof(double*), cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_d_C, batch_count*sizeof(double*), cudaMemcpyHostToDevice);

    // Fill A,B diagonals with k*sin(i) data, C diagonal with k*cos(i)^2
    // Matrices are arranged column major
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                if(i==j) {
                    (A[k])[index] = k*sin(index);
                    (B[k])[index] = sin(index);
                    (C[k])[index] = k*cos(index)*cos(index);
                }
		else {
                    (A[k])[index] = 0.0;
                    (B[k])[index] = 0.0;
                    (C[k])[index] = 0.0;
                }
            } // i   
        } // j
    } // k

    // Create cublas instance
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Set input matrices on device
    for(i=0; i<batch_count; i++) {
        cublasSetMatrix(dim, dim, sizeof(double), A[i], dim, h_d_A[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), B[i], dim, h_d_B[i], dim);
        cublasSetMatrix(dim, dim, sizeof(double), C[i], dim, h_d_C[i], dim);
    }

    // Set matrix coefficients
    double alpha = 1.0;
    double beta  = 1.0;

    // DGEMM: C = alpha*A*B + beta*C
    cublasDgemmBatched(handle,
                       CUBLAS_OP_N, CUBLAS_OP_N,
                       dim, dim, dim,
                       &alpha,
                       (const double**)d_A, dim,
                       (const double**)d_B, dim,
                       &beta,
                       d_C, dim,
                       batch_count);

    // Retrieve result matrix from device
    for(i=0; i<batch_count; i++)
        cublasGetMatrix(dim, dim, sizeof(double), h_d_C[i], dim, C[i], dim);

    // Simple sanity test, sum up all elements
    double sum = 0;
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
            for(i=0; i<dim; i++) {
                index = j*dim + i;
                sum += (C[k])[index];
            }
        }
    }
    printf("Element sum is: %f, should be: %d\n", sum, dim*(batch_count-1)*(batch_count)/2);   

    // Clean up resources

    for(i=0; i<batch_count; i++) {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        cudaFree(h_d_A[i]);
        cudaFree(h_d_B[i]);
        cudaFree(h_d_C[i]);
    }

    free(A);
    free(B);
    free(C);
    free(h_d_A);
    free(h_d_B);
    free(h_d_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cublasDestroy(handle);

    return 0;
}
Esempio n. 23
0
int main(int argc, char *argv[])
{
    int testN = 1;
    bool check_correctness = false;
    if (argc > 1) {
        testN = atoi(argv[1]);
    }
    if (argc > 2) {
        check_correctness = atoi(argv[2]);
    }

    std::cout << std::endl << "----------" << std::endl;
    std::cout << "Running sequential MM benchmark: testN: " << testN
              << ", check correctness: " << check_correctness
              << ", size: (" << S0 << ", " << S1 << ", " << S2 << ", " << S3 << ")" << std::endl;

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t2 = t1;
    
    float *A = (float*) malloc(S0 * S1 * sizeof(float));
    float *B = (float*) malloc(S1 * S2 * sizeof(float));
    float *C = (float*) malloc(S2 * S3 * sizeof(float));

    // Initialize matrices with random values:
    for (int i = 0; i < S0 * S1; i++) A[i] = std::rand() % 10;
    for (int i = 0; i < S1 * S2; i++) B[i] = std::rand() % 10;
    for (int i = 0; i < S2 * S3; i++) C[i] = std::rand() % 10;

    std::cout << "Buffers initialized" << std::endl << std::flush;

    // Note that indices are flipped (see tutorial 2)
    Halide::Buffer<DATA_TYPE> A_buf(A, {S1, S0});
    Halide::Buffer<DATA_TYPE> B_buf(B, {S2, S1});
    Halide::Buffer<DATA_TYPE> C_buf(C, {S3, S2});
    Halide::Buffer<DATA_TYPE> O_buf(S3, S0);

    // Make a dummy call to set up GPU (initalization takes time)
    matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer());

    // CPU Multiplication for correctness check

    if (check_correctness) {
        // Reference matrix multiplication

        std::cout << "Running CPU multiplication.." << std::endl;

        Halide::Buffer<DATA_TYPE> O_val_buf(S3, S0);
        Halide::Buffer<DATA_TYPE> T_val_buf(S2, S0);
        t1 = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < S0; i++) {
            for (int k = 0; k < S2; k++) {
                // Note that indices are flipped (see tutorial 2)
                T_val_buf(k, i) = 0;
            }
        }
        for (int i = 0; i < S0; i++) {
            for (int l = 0; l < S3; l++) {
                // Note that indices are flipped (see tutorial 2)
                O_val_buf(l, i) = 0;
            }
        }
        for (int j = 0; j < S1; j++) {
            for (int i = 0; i < S0; i++) {
                for (int k = 0; k < S2; k++) {
                    // Note that indices are flipped (see tutorial 2)
                    T_val_buf(k, i) += A_buf(j, i) * B_buf(k, j);
                }
            }
        }
        for (int k = 0; k < S2; k++) {
            for (int i = 0; i < S0; i++) {
                for (int l = 0; l < S3; l++) {
                    // Note that indices are flipped (see tutorial 2)
                    O_val_buf(l, i) += T_val_buf(k, i) * C_buf(l, k);
                }
            }
        }
        t2 = std::chrono::high_resolution_clock::now();

        std::cout << "CPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() << "ms" << std::endl << std::flush;

        compare_buffers("matmul", O_buf, O_val_buf);
    }

    // GPU Multiplication

    t1 = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < testN; i++) {
        matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer());
    }
    t2 = std::chrono::high_resolution_clock::now();

    std::cout << "GPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() / testN << "ms" << std::endl << std::flush;

    // CUBLAS SGEMM

    // Transposed copies for cublas
    float *A_T = (float*) malloc(S0 * S1 * sizeof(float));
    float *B_T = (float*) malloc(S1 * S2 * sizeof(float));
    float *C_T = (float*) malloc(S2 * S3 * sizeof(float));
    float *O_T = (float*) malloc(S0 * S3 * sizeof(float));
    // Transpose
    for (int i = 0; i < S0; i++) for (int j = 0; j < S1; j++) A_T[i + j * S0] = A[i * S1 + j];
    for (int i = 0; i < S1; i++) for (int j = 0; j < S2; j++) B_T[i + j * S1] = B[i * S2 + j];
    for (int i = 0; i < S2; i++) for (int j = 0; j < S3; j++) C_T[i + j * S2] = C[i * S3 + j];

    // Excluding handle creation which is time consuming
    cublasHandle_t handle;
    cublasCreate(&handle);

    t1 = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < testN; i++) {
        float *d_A;
        float *d_B;
        float *d_C;
        float *d_T;
        float *d_O;
        cudaMalloc((void**)&d_A, S0 * S1 * sizeof(*A));
        cudaMalloc((void**)&d_B, S1 * S2 * sizeof(*A));
        cudaMalloc((void**)&d_C, S2 * S3 * sizeof(*A));
        cudaMalloc((void**)&d_T, S0 * S2 * sizeof(*A));
        cudaMalloc((void**)&d_O, S0 * S3 * sizeof(*A));

        cublasSetMatrix(S0, S1, sizeof(*A), A_T, S0, d_A, S0);
        cublasSetMatrix(S1, S2, sizeof(*B), B_T, S1, d_B, S1);
        cublasSetMatrix(S2, S3, sizeof(*C), C_T, S2, d_C, S2);

        float alpha_var = 1;
        float beta_var = 0;

        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S2, S1, &alpha_var, d_A, S0, d_B, S1, &beta_var, d_T, S0);
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S3, S2, &alpha_var, d_T, S0, d_C, S2, &beta_var, d_O, S0);

        cublasGetMatrix(S0, S3, sizeof(*C), d_O, S0, O_T, S0);

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        cudaFree(d_T);
        cudaFree(d_O);
    }

    t2 = std::chrono::high_resolution_clock::now();

    std::cout << "cublas matmul done (excluding cublasHandle creation): "
              << (std::chrono::duration<double,std::milli>(t2 - t1) / testN).count() << "ms" << std::endl << std::flush;

    cublasDestroy(handle);

    bool check_cublas_difference = false;
    if (check_cublas_difference) {
        bool flag = true;
        for (int i = 0; i < S0 && flag; i++) {
            for (int j = 0; j < S3; j++) {
                if (O_buf(j, i) != O_T[i + j * S0]) {
                    std::cout << "cublas validation mismatch:" << std::endl;
                    std::cout << i << " " << j << " " << O_T[i + j * S0] << " " << O_buf(j, i) << std::endl;
                }
            }
        }
        if (flag) {
            std::cout << "cublas and validation match" << std::endl;
        }
    }

    free(A);
    free(B);
    free(C);
    free(A_T);
    free(B_T);
    free(C_T);
    free(O_T);

    std::cout << "----------" << std::endl << std::endl;

    return 0;
}
Esempio n. 24
0
/* ========================================================================== */
int sci_gpuLU(char *fname)
{
    CheckRhs(1,2);
    CheckLhs(2,2);
    #ifdef WITH_CUDA
        cublasStatus status;
    #endif
    SciErr sciErr;
    int*    piAddr_A    = NULL;
    double* h_A         = NULL;
    double* hi_A        = NULL;
    int     rows_A;
    int     cols_A;

    int*    piAddr_Opt  = NULL;
    double* option      = NULL;
    int     rows_Opt;
    int     cols_Opt;

    void*   d_A         = NULL;
    int     na;
    void*   pvPtr       = NULL;

    int     size_A      = sizeof(double);
    bool    bComplex_A  = FALSE;
    int     inputType_A;
    int     inputType_Opt;
    double  res;
    int     posOutput   = 1;

    try
    {
        sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A);
        if(sciErr.iErr) throw sciErr;
        if(Rhs == 2)
        {
            sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt);
            if(sciErr.iErr) throw sciErr;
            sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt);
            if(sciErr.iErr) throw sciErr;
            if(inputType_Opt == sci_matrix)
            {
                sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option);
                if(sciErr.iErr) throw sciErr;
            }
            else
                throw "Option syntax is [number,number].";
        }
        else
        {
            rows_Opt=1;
            cols_Opt=2;
            option = (double*)malloc(2*sizeof(double));
            option[0]=0;
            option[1]=0;
        }

        if(rows_Opt != 1 || cols_Opt != 2)
            throw "Option syntax is [number,number].";

        if((int)option[1] == 1 && !isGpuInit())
            throw "gpu is not initialised. Please launch gpuInit() before use this function.";

        sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A);
        if(sciErr.iErr) throw sciErr;

        #ifdef WITH_CUDA
        if (useCuda())
        {
            if(inputType_A == sci_pointer)
            {
                sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr);
                if(sciErr.iErr) throw sciErr;

                gpuMat_CUDA* gmat;
                gmat = static_cast<gpuMat_CUDA*>(pvPtr);
				if(!gmat->useCuda)
					throw "Please switch to OpenCL mode before use this data.";
                rows_A=gmat->rows;
                cols_A=gmat->columns;
                if(gmat->complex)
                {
                    bComplex_A = TRUE;
                    size_A = sizeof(cuDoubleComplex);
                    d_A=(cuDoubleComplex*)gmat->ptr->get_ptr();
                }
                else
                    d_A=(double*)gmat->ptr->get_ptr();

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                na = rows_A * cols_A;
            }
            else if(inputType_A == 1)
            {
                // Get size and data
                if(isVarComplex(pvApiCtx, piAddr_A))
                {
                    sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A);
                    if(sciErr.iErr) throw sciErr;
                    size_A = sizeof(cuDoubleComplex);
                    bComplex_A = TRUE;
                }
                else
                {
                    sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A);
                    if(sciErr.iErr) throw sciErr;
                }

                na = rows_A * cols_A;

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Allocate device memory
                status = cublasAlloc(na, size_A, (void**)&d_A);
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Initialize the device matrices with the host matrices
                if(!bComplex_A)
                {
                    status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A);
                    if (status != CUBLAS_STATUS_SUCCESS) throw status;
                }
                else
                    writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A);

            }
            else
                throw "Bad argument type.";

            cuDoubleComplex resComplex;
            // Performs operation
            if(!bComplex_A)
                status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1);
       //     else
       //         resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A);

            if (status != CUBLAS_STATUS_SUCCESS) throw status;

            // Put the result in scilab
            switch((int)option[0])
            {
                case 2 :
                case 1 :    sciprint("The first option must be 0 for this function. Considered as 0.\n");

                case 0 :    // Keep the result on the Host.
                {           // Put the result in scilab
                    if(!bComplex_A)
                    {
                        double* h_res = NULL;
                        sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res);
                        if(sciErr.iErr) throw sciErr;
                        status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                    }
                    else
                    {
                        sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y);
                        if(sciErr.iErr) throw sciErr;
                    }

                    LhsVar(posOutput)=Rhs+posOutput;
                    posOutput++;
                    break;
                }

                default : throw "First option argument must be 0 or 1 or 2.";
            }

            switch((int)option[1])
            {
                case 0 :    // Don't keep the data input on Device.
                {
                    if(inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1 :    // Keep data of the fisrt argument on Device and return the Device pointer.
                {
                    if(inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA* dptr;
                        gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
                        dptr=new gpuMat_CUDA(tmp);
						dptr->useCuda = true;
                        dptr->ptr->set_ptr((double*)d_A);
                        if(bComplex_A)
                            dptr->complex=TRUE;
                        else
                            dptr->complex=FALSE;

                        sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
                        if(sciErr.iErr) throw sciErr;
                        LhsVar(posOutput)=Rhs+posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";

                    posOutput++;
                    break;
                }

                default : throw "Second option argument must be 0 or 1.";
            }
            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
        #endif

        #ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
        #endif
        if(Rhs == 1)
        {
            free(option);
            option = NULL;
        }

        if(posOutput < Lhs+1)
            throw "Too many output arguments.";

        if(posOutput > Lhs+1)
            throw "Too few output arguments.";

        PutLhsVar();
        return 0;
    }
    catch(const char* str)
    {
        Scierror(999,"%s\n",str);
    }
    catch(SciErr E)
    {
        printError(&E, 0);
    }
    #ifdef WITH_CUDA
    catch(cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch(cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
    }
    if (useCuda())
    {
        if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
    }
    #endif
    #ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999,"not implemented with OpenCL.\n");
    }
    #endif
    if(Rhs == 1 && option != NULL) free(option);
    return EXIT_FAILURE;
}
Esempio n. 25
0
extern "C" magma_int_t
magma_zgeqrf2(magma_context *cntxt, magma_int_t m, magma_int_t n, 
          cuDoubleComplex *a,    magma_int_t lda, cuDoubleComplex *tau, 
          cuDoubleComplex *work, magma_int_t lwork,
          magma_int_t *info)
{
/*  -- MAGMA (version 1.5.0-beta3) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date July 2014

    Purpose
    =======
    ZGEQRF computes a QR factorization of a COMPLEX_16 M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.

    Arguments
    =========
    CNTXT   (input) MAGMA_CONTEXT
            CNTXT specifies the MAGMA hardware context for this routine.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).

            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    TAU     (output) COMPLEX_16 array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    WORK    (workspace/output) COMPLEX_16 array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

        Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using cudaMallocHost.

    LWORK   (input) INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_zgeqrf_nb(M).

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  if INFO = -8, the GPU memory allocation failed

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    #define  a_ref(a_1,a_2) ( a+(a_2)*(lda) + (a_1))
    #define da_ref(a_1,a_2) (da+(a_2)*ldda  + (a_1))

    int cnt=-1;
    cuDoubleComplex c_one = MAGMA_Z_ONE;

    int i, k, lddwork, old_i, old_ib;
    int nbmin, nx, ib, ldda;

    *info = 0;

    magma_qr_params *qr_params = (magma_qr_params *)cntxt->params;
    int nb = qr_params->nb;

    int lwkopt = n * nb;
    work[0] = MAGMA_Z_MAKE( (double)lwkopt, 0 );
    long int lquery = (lwork == -1);
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }
    else if (lquery)
      return MAGMA_SUCCESS;

    k = min(m,n);
    if (k == 0) {
        work[0] = c_one;
        return MAGMA_SUCCESS;
    }

    cublasStatus status;
    static cudaStream_t stream[2];
    cudaStreamCreate(&stream[0]);
    cudaStreamCreate(&stream[1]);

    nbmin = 2;
    nx = nb;

    lddwork = ((n+31)/32)*32;
    ldda    = ((m+31)/32)*32;

    cuDoubleComplex *da;
    status = cublasAlloc((n)*ldda + nb*lddwork, sizeof(cuDoubleComplex), (void**)&da);
    if (status != CUBLAS_STATUS_SUCCESS) {
        *info = -8;
        return 0;
    }
    cuDoubleComplex *dwork = da + ldda*(n);

    if (nb >= nbmin && nb < k && nx < k) {
        /* Use blocked code initially */
        cudaMemcpy2DAsync(da_ref(0,nb), ldda*sizeof(cuDoubleComplex),
                           a_ref(0,nb), lda *sizeof(cuDoubleComplex),
                          sizeof(cuDoubleComplex)*(m), (n-nb),
                          cudaMemcpyHostToDevice,stream[0]);

        old_i = 0; old_ib = nb;
        for (i = 0; i < k-nx; i += nb) {
            ib = min(k-i, nb);
            if (i>0){
                cudaMemcpy2DAsync( a_ref(i,i),  lda *sizeof(cuDoubleComplex),
                                   da_ref(i,i), ldda*sizeof(cuDoubleComplex),
                                   sizeof(cuDoubleComplex)*(m-i), ib,
                                   cudaMemcpyDeviceToHost,stream[1]);

                cudaMemcpy2DAsync( a_ref(0,i),  lda *sizeof(cuDoubleComplex),
                                   da_ref(0,i), ldda*sizeof(cuDoubleComplex),
                                   sizeof(cuDoubleComplex)*i, ib,
                                   cudaMemcpyDeviceToHost,stream[0]);

                /* Apply H' to A(i:m,i+2*ib:n) from the left */
                magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                  m-old_i, n-old_i-2*old_ib, old_ib,
                  da_ref(old_i, old_i),          ldda, dwork,        lddwork,
                  da_ref(old_i, old_i+2*old_ib), ldda, dwork+old_ib, lddwork);
            }

            cudaStreamSynchronize(stream[1]);
            int rows = m-i;

        cnt++;
        cntxt->nb = qr_params->ib;
        magma_zgeqrf_mc(cntxt, &rows, &ib, a_ref(i,i), &lda, 
                tau+i, work, &lwork, info);
        cntxt->nb = nb;

            /* Form the triangular factor of the block reflector
               H = H(i) H(i+1) . . . H(i+ib-1) */
            lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, 
                              &rows, &ib, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib);
        if (cnt < qr_params->np_gpu) {
          qr_params->p[cnt]=a;
        }
        zpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb);
            cublasSetMatrix(rows, ib, sizeof(cuDoubleComplex),
                            a_ref(i,i), lda, da_ref(i,i), ldda);
        if (qr_params->flag == 1)
          zq_to_panel(MagmaUpper, ib, a_ref(i,i), lda, qr_params->w+cnt*qr_params->nb*qr_params->nb);
        
            if (i + ib < n) { 
          cublasSetMatrix(ib, ib, sizeof(cuDoubleComplex), qr_params->t+cnt*nb*nb, ib, dwork, lddwork);

          if (i+ib < k-nx)
        /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
        magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                  rows, ib, ib, 
                  da_ref(i, i   ), ldda, dwork,    lddwork, 
                  da_ref(i, i+ib), ldda, dwork+ib, lddwork);
          else
        magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, 
                  rows, n-i-ib, ib, 
                  da_ref(i, i   ), ldda, dwork,    lddwork, 
                  da_ref(i, i+ib), ldda, dwork+ib, lddwork);

          old_i  = i;
          old_ib = ib;
            }
        }
    } else {
      i = 0;
    }
    
    /* Use unblocked code to factor the last or only block. */
    if (i < k) 
      {
    ib = n-i;
    if (i!=0)
      cublasGetMatrix(m, ib, sizeof(cuDoubleComplex),
              da_ref(0,i), ldda, a_ref(0,i), lda);
        int rows = m-i;
    
        cnt++;
        lapackf77_zgeqrf(&rows, &ib, a_ref(i,i), &lda, tau+i, work, &lwork, info);
    
    if (cnt < qr_params->np_gpu) 
      {
        int ib2=min(ib,nb);
        
        lapackf77_zlarft( MagmaForwardStr, MagmaColumnwiseStr, 
                              &rows, &ib2, a_ref(i,i), &lda, tau+i, qr_params->t+cnt*nb*nb, &ib2);
        
        qr_params->p[cnt]=a;
      }
      }
    
    cudaStreamDestroy( stream[0] );
    cudaStreamDestroy( stream[1] );
    cublasFree(da);
    return MAGMA_SUCCESS;
} /* magma_zgeqrf */