Example #1
0
cudaError_t cudaFreeHost(void *ptr)
{
	cudaHostUnregister(ptr);
	VirtioQCArg arg;
	memset(&arg, 0, sizeof(VirtioQCArg));

	ptr(arg.pA, ptr, 0);
	send_cmd_to_device( VIRTQC_cudaFreeHost, &arg);
	return (cudaError_t)arg.cmd;
}
Example #2
0
void dfft_cuda_destroy_plan(dfft_plan plan)
    {
    dfft_destroy_plan_common(plan, 1);

    #ifndef ENABLE_MPI_CUDA
    cudaHostUnregister(plan.h_stage_in);
    cudaHostUnregister(plan.h_stage_out);
    free(plan.h_stage_in);
    free(plan.h_stage_out);
    #endif

    int dmax = plan.max_depth + 2;
    int d;
    for (d = 0; d < dmax; ++d)
        {
        cudaFree(plan.d_rev_j1[d]);
        cudaFree(plan.d_rev_partial[d]);
        cudaFree(plan.d_rev_global[d]);
        cudaFree(plan.d_c0[d]);
        cudaFree(plan.d_c1[d]);
        }
    for (d = 0; d < plan.max_depth; ++d)
        {
        cudaFree(plan.d_alpha[d]);
        free(plan.h_alpha[d]);
        }
    free(plan.d_c0);
    free(plan.d_c1);
    free(plan.d_rev_j1);
    free(plan.d_rev_partial);
    free(plan.d_rev_global);
    if (plan.max_depth)
        {
        free(plan.d_alpha);
        free(plan.h_alpha);
        }

    cudaFree(plan.d_pidx);
    cudaFree(plan.d_pdim);
    cudaFree(plan.d_iembed);
    cudaFree(plan.d_oembed);
    cudaFree(plan.d_length);
    }
Example #3
0
void DataBuffer<DT>::page_unlk ()
{ cuda_check (cudaHostUnregister ( data_.dptr));
  cuda_check (cudaHostUnregister ( pred_.dptr));
  cuda_check (cudaHostUnregister (label_.dptr));
}
Example #4
0
void cblas_sgemm(const enum CBLAS_ORDER Order, 
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, 
                 const int M, const int N, const int K, 
                 const float alpha, const float *A, const int lda, 
                 const float *B, const int ldb,
                 const float beta, float *C, const int ldc )
{
    cublasOperation_t transa, transb;
    cublasStatus_t status;
    /*---error handler---*/
    int nrowa, ncola, nrowb, ncolb;
    if (TransA == CblasNoTrans) {
        nrowa = M;
        ncola = K;
    } else {
        nrowa = K;
        ncola = M;
    }
    if (TransB == CblasNoTrans) {
        nrowb = K;
        ncolb = N;
    } else {
        nrowb = N;
        ncolb = K;
    }
    int nrowc = M;
    int ncolc = N;
    int info = 0;
    if (CBLasTransToCuBlasTrans(TransA,&transa) <  0) info = 1;
    else if (CBLasTransToCuBlasTrans(TransB,&transb) < 0) info = 2;
    else if (M < 0) info = 3;
    else if (N < 0) info = 4;
    else if (K < 0) info = 5;
    else if (lda < MAX(1, nrowa)) info = 8;
    else if (ldb < MAX(1, nrowb)) info = 10;
    else if (ldc < MAX(1, M)) info = 13;
    if (info != 0) {
        xerbla_(ERROR_NAME, &info);
        return;
    }
    /*-------------------*/
    
    /*----dispatcher-----*/
    int type = 0; //1:cpu 2:cublasxt 3:blasx
    if (M <= 0 || N <= 0 || K <= 0) type = 1;
    if (type == 0 && (M > 1000 || N > 1000 || K > 1000)) type = 3;
    else                                                 type = 1;
    //Blasx_Debug_Output("type after dispatcher:%d\n",type);
    /*-------------------*/
    

    switch (type) {
        case 1:
        CPU_BLAS:
            Blasx_Debug_Output("calling cblas_sgemm:");
            if (cpublas_handle == NULL) blasx_init(CPU);
            if (cblas_sgemm_p == NULL) blasx_init_cblas_func(&cblas_sgemm_p, "cblas_sgemm");
            (*cblas_sgemm_p)(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
            break;
        case 2:
            if (cublasXt_handle == NULL) blasx_init(CUBLASXT);
            Blasx_Debug_Output("calling cublasSgemmXt:");
            status = cublasXtSgemm(cublasXt_handle,
                                   transa, transb,
                                   M, N, K,
                                   (float*)&alpha, (float*)A, lda,
                                   (float*)B, ldb,
                                   (float*)&beta, (float*)C, ldc);
            if( status != CUBLAS_STATUS_SUCCESS ) goto CPU_BLAS;
            break;
        case 3:
            Blasx_Debug_Output("calling BLASX:\n");
            cudaHostRegister(A,sizeof(float)*nrowa*ncola,cudaHostRegisterPortable);
            cudaHostRegister(B,sizeof(float)*nrowb*ncolb,cudaHostRegisterPortable);
            cudaHostRegister(C,sizeof(float)*nrowc*ncolc,cudaHostRegisterPortable);
#ifdef BENCHMARK
            double Gflops = FLOPS_DGEMM(M, N, K)/(1000000000);
            double gpu_start, gpu_end;
            gpu_start = get_cur_time();
#endif
            if (is_blasx_enable == 0) blasx_init(BLASX);
            assert( is_blasx_enable == 1 );
            assert( SYS_GPUS > 0 );
            assert( event_SGEMM[0] != NULL );
            assert( C_dev_SGEMM[0] != NULL );
            assert( handles_SGEMM[0] != NULL );
            assert( streams_SGEMM[0] != NULL );
            LRU_t* LRUs[10];
            int GPU_id = 0;
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)    LRUs[GPU_id] = LRU_init( GPU_id );
            blasx_sgemm(SYS_GPUS, handles_SGEMM, LRUs,
                        TransA, TransB,
                        M, N, K, alpha,
                        A, lda,
                        B, ldb,
                        beta,
                        C, ldc);
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)    LRU_free( LRUs[GPU_id], GPU_id );
#ifdef BENCHMARK
            gpu_end = get_cur_time();
            printf("BLASX (M:%5d,N:%5d,K:%5d) Speed:%9.1f type:%2d\n", M, N, K, (double)Gflops/(gpu_end - gpu_start), type);
#endif
            cudaHostUnregister(A);
            cudaHostUnregister(B);
            cudaHostUnregister(C);
            break;
        default:
            break;
    }

    //Blasx_Debug_Output("eventually use type:%d to compute\n",type);
}
Example #5
0
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  nvtxRangeId_t r1=nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostRegister(&p.coefs[0],ncols*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostRegister(&A.send_buffer[0],A.send_buffer.size()*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
#endif
#endif

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);

    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    //TODO remove false below
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!"<<std::endl;
#endif
        //update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    TICK(); waxpby(one, x, alpha, p, x);
            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);
    num_iters = k;
  }
  
#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
Example #6
0
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    double *A = NULL, *AT, *b, *bT, *x;
    PLASMA_desc        *descA, *descB, *descL;
    real_Double_t       t;
    int                *piv;
    int nb, nb2, nt;
    int n     = iparam[TIMING_N];
    int nrhs  = iparam[TIMING_NRHS];
    int check = iparam[TIMING_CHECK];
    int lda      = n;
    int ldb      = n;

    /* Initialize Plasma */ 
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    if ( iparam[TIMING_SCHEDULER] )
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );
    else
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING );

#if defined(PLASMA_CUDA)
    core_cublas_init();
#endif

    /*if ( !iparam[TIMING_AUTOTUNING] ) {*/
        PLASMA_Disable(PLASMA_AUTOTUNING);
        PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
        PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
    /* } else { */
    /*     PLASMA_Get(PLASMA_TILE_SIZE,        &iparam[TIMING_NB] ); */
    /*     PLASMA_Get(PLASMA_INNER_BLOCK_SIZE, &iparam[TIMING_IB] ); */
    /* }  */
    nb  = iparam[TIMING_NB];
    nb2 = nb * nb;
    nt  = n / nb + ((n % nb == 0) ? 0 : 1);
    
    /* Allocate Data */
    AT  = (double *)malloc(nt*nt*nb2*sizeof(double));

    /* Check if unable to allocate memory */
    if ( !AT ){
        printf("Out of Memory \n ");
        exit(0);
    }

#if defined(PLASMA_CUDA)
    cudaHostRegister(AT, nt*nt*nb2*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Initialiaze Data */
    PLASMA_Desc_Create(&descA, AT, PlasmaRealDouble, nb, nb, nb*nb, n, n, 0, 0, n, n);
    LAPACKE_dlarnv_work(1, ISEED, nt*nt*nb2, AT);

    /* Allocate Workspace */
    PLASMA_Alloc_Workspace_dgesv_incpiv_Tile(n, &descL, &piv);

    {
      int NB, MT, NT;
      size_t size;
      NB = nb;
      NT = (n%NB==0) ? (n/NB) : ((n/NB)+1);
      MT = (n%NB==0) ? (n/NB) : ((n/NB)+1);
      size = (size_t)MT*NT*NB * sizeof(int);
#if defined(PLASMA_CUDA)
      cudaHostRegister((void*)piv, size, cudaHostRegisterPortable);
#endif
    }
#if defined(PLASMA_CUDA)
    cudaHostRegister((void*)descL->mat, descL->lm*descL->ln*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Save AT in lapack layout for check */
    if ( check ) {
        A = (double *)malloc(lda*n    *sizeof(double));
        PLASMA_Tile_to_Lapack(descA, (void*)A, n);
    }

    t = -cWtime();
    PLASMA_dgetrf_incpiv_Tile( descA, descL, piv );
    t += cWtime();
    *t_ = t;
    
    /* Check the solution */
    if ( check )
      {
        b  = (double *)malloc(ldb*nrhs *sizeof(double));
        bT = (double *)malloc(nt*nb2   *sizeof(double));
        x  = (double *)malloc(ldb*nrhs *sizeof(double));

        LAPACKE_dlarnv_work(1, ISEED, n*nrhs, b);
        PLASMA_Desc_Create(&descB, bT, PlasmaRealDouble, nb, nb, nb*nb, n, nrhs, 0, 0, n, nrhs);
        PLASMA_Lapack_to_Tile((void*)b, n, descB);

        PLASMA_dgetrs_incpiv_Tile( descA, descL, piv, descB );

        PLASMA_Tile_to_Lapack(descB, (void*)x, n);

        dparam[TIMING_RES] = d_check_solution(n, n, nrhs, A, lda, b, x, ldb,
                                             &(dparam[TIMING_ANORM]), &(dparam[TIMING_BNORM]), 
                                             &(dparam[TIMING_XNORM]));

        PLASMA_Desc_Destroy(&descB);
        free( A ); free( b ); free( bT ); free( x );
      }

    /* Deallocate Workspace */
    PLASMA_Dealloc_Handle_Tile(&descL);

    PLASMA_Desc_Destroy(&descA);

    PLASMA_Finalize();
#if defined(PLASMA_CUDA)
    cudaHostUnregister(AT);
    cudaHostUnregister(piv);
#endif
    free( AT );
    free( piv );

    return 0;
}