Example #1
/* Emulate cudaHostAlloc() by page-locking an aligned region of a
 * malloc()'d block with cudaHostRegister().  Note that the unaligned
 * base pointer a_UA is discarded, so the block can be unregistered
 * later but never freed. */
cudaError_t cudaHostAlloc(void **pHost, size_t size, unsigned int flags)
{
	void *a_UA = malloc(size + MEMORY_ALIGNMENT);
	if (a_UA == NULL)
		return cudaErrorMemoryAllocation;
	*pHost = (void *) ALIGN_UP(a_UA, MEMORY_ALIGNMENT);

	return cudaHostRegister(*pHost, size, flags);
}
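A minimal usage sketch for the shim above (not part of the original source): check the returned status and later undo the pinning with cudaHostUnregister(). Because the shim discards the unaligned malloc() pointer, the block itself cannot be freed here; only the page-locking can be released.

#include <cuda_runtime.h>
#include <stdio.h>

static int demoPinnedBuffer(size_t nbytes)
{
	void *buf = NULL;

	/* Pin an aligned, malloc-backed buffer through the shim above
	   (flags are forwarded to cudaHostRegister, so 0 means default). */
	cudaError_t err = cudaHostAlloc(&buf, nbytes, 0);
	if (err != cudaSuccess) {
		fprintf(stderr, "pinning failed: %s\n", cudaGetErrorString(err));
		return -1;
	}

	/* ... use buf as the host side of cudaMemcpyAsync() transfers ... */

	cudaHostUnregister(buf);  /* unpin; the malloc'd block itself is leaked */
	return 0;
}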
void setTransposeCudaMpi(domain_t domain){

  CHECK_CUBLAS( cublasCreate(&cublasHandle) );

  alpha[0].x=1.0f;
  alpha[0].y=0.0f;

  size=NXSIZE*NY*NZ*sizeof(float2);

  /* Allocate pageable host staging buffers, then pin them in place so
     they can back asynchronous copies and MPI transfers. */
  aux_host1=(float2*)malloc(size);
  aux_host2=(float2*)malloc(size);

  CHECK_CUDART( cudaHostRegister(aux_host1,size,0) );
  CHECK_CUDART( cudaHostRegister(aux_host2,size,0) );
  
  return;
}
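The setup above pins aux_host1/aux_host2 and creates a cuBLAS handle, but the matching teardown is not shown. A sketch of what it might look like, assuming the same globals and CHECK_* macros (the function name is a placeholder, not from the original code):

void destroyTransposeCudaMpi(void){

  /* Unpin first; cudaHostUnregister must see the same pointer that was
     passed to cudaHostRegister. */
  CHECK_CUDART( cudaHostUnregister(aux_host1) );
  CHECK_CUDART( cudaHostUnregister(aux_host2) );

  free(aux_host1);
  free(aux_host2);

  CHECK_CUBLAS( cublasDestroy(cublasHandle) );

  return;
}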
Example #3
static int
RunTest(int *iparam, float *dparam, real_Double_t *t_) 
{
    float *AT, *BT, *CT;
    float *A = NULL, *B = NULL, *C1 = NULL, *C2 = NULL;
    float alpha, beta;
    PLASMA_desc        *descA, *descB, *descC;
    real_Double_t       t;
    int nb, nb2, nt;
    int n       = iparam[TIMING_N];
    int check   = iparam[TIMING_CHECK];
    int lda     = n;
    
    /* Initialize Plasma */
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    if ( iparam[TIMING_SCHEDULER] )
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );
    else
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING );

    /*if ( !iparam[TIMING_AUTOTUNING] ) {*/
        PLASMA_Disable(PLASMA_AUTOTUNING);
        PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
    /* } */
    /* } else { */
    /*     PLASMA_Get(PLASMA_TILE_SIZE,        &iparam[TIMING_NB] ); */
    /* }  */
    nb  = iparam[TIMING_NB];
    nb2 = nb * nb;
    nt  = n / nb + ((n % nb == 0) ? 0 : 1);

    /* Allocate Data */
    AT = (float *)malloc(nt*nt*nb2*sizeof(float));
    BT = (float *)malloc(nt*nt*nb2*sizeof(float));
    CT = (float *)malloc(nt*nt*nb2*sizeof(float));

    /* Abort if any allocation failed */
    if ( (!AT) || (!BT) || (!CT) ) {
        printf("Out of Memory\n");
        exit(1);
    }
  
#if defined(PLASMA_CUDA)
    cudaHostRegister(AT, nt*nt*nb2*sizeof(float), cudaHostRegisterPortable);
    cudaHostRegister(BT, nt*nt*nb2*sizeof(float), cudaHostRegisterPortable);
    cudaHostRegister(CT, nt*nt*nb2*sizeof(float), cudaHostRegisterPortable);
#endif

    /* Initialize Data */
    LAPACKE_slarnv_work(1, ISEED, 1, &alpha);
    LAPACKE_slarnv_work(1, ISEED, 1, &beta);
    LAPACKE_slarnv_work(1, ISEED, nt*nt*nb2, AT);
    LAPACKE_slarnv_work(1, ISEED, nt*nt*nb2, BT);
    LAPACKE_slarnv_work(1, ISEED, nt*nt*nb2, CT);

    /* Create tile descriptors for A, B and C */
    PLASMA_Desc_Create(&descA, AT, PlasmaRealFloat, nb, nb, nb*nb, n, n, 0, 0, n, n);
    PLASMA_Desc_Create(&descB, BT, PlasmaRealFloat, nb, nb, nb*nb, n, n, 0, 0, n, n);
    PLASMA_Desc_Create(&descC, CT, PlasmaRealFloat, nb, nb, nb*nb, n, n, 0, 0, n, n);

    if (check)
      {
          C2 = (float *)malloc(n*lda*sizeof(float));
          PLASMA_Tile_to_Lapack(descC, (void*)C2, n);
      }

#if defined(PLASMA_CUDA)
    core_cublas_init();
#endif

    t = -cWtime();
    PLASMA_sgemm_Tile( PlasmaNoTrans, PlasmaNoTrans, alpha, descA, descB, beta, descC );
    t += cWtime();
    *t_ = t;
    
    /* Check the solution */
    if (check)
      {
          A = (float *)malloc(n*lda*sizeof(float));
          PLASMA_Tile_to_Lapack(descA, (void*)A, n);
          free(AT);

          B = (float *)malloc(n*lda*sizeof(float));
          PLASMA_Tile_to_Lapack(descB, (void*)B, n);
          free(BT);

          C1 = (float *)malloc(n*lda*sizeof(float));
          PLASMA_Tile_to_Lapack(descC, (void*)C1, n);
          free(CT);

          dparam[TIMING_RES] = s_check_gemm( PlasmaNoTrans, PlasmaNoTrans, n, n, n, 
                                            alpha, A, lda, B, lda, beta, C1, C2, lda,
                                            &(dparam[TIMING_ANORM]), &(dparam[TIMING_BNORM]), 
                                            &(dparam[TIMING_XNORM]));
          free(C2);
      }
    else {
        free( AT );
        free( BT );
        free( CT );
    }

    PLASMA_Desc_Destroy(&descA);
    PLASMA_Desc_Destroy(&descB);
    PLASMA_Desc_Destroy(&descC);
    PLASMA_Finalize();

    return 0;
}
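The PLASMA_CUDA blocks above pin AT, BT, and CT but ignore the cudaHostRegister return status, and the buffers are later freed without being unregistered. A small checked pair of helpers is sketched here; the names are placeholders and are not part of the original timing driver:

#if defined(PLASMA_CUDA)
#include <cuda_runtime.h>
#include <stdio.h>

/* Placeholder helpers, not from the original timing code. */
static void pin_or_warn(void *ptr, size_t bytes)
{
    cudaError_t err = cudaHostRegister(ptr, bytes, cudaHostRegisterPortable);
    if (err != cudaSuccess)
        fprintf(stderr, "cudaHostRegister failed (%s); using pageable memory\n",
                cudaGetErrorString(err));
}

static void unpin(void *ptr)
{
    /* Call before free() on every successfully pinned buffer. */
    cudaHostUnregister(ptr);
}
#endif

With these, the three registrations above become pin_or_warn(AT, nt*nt*nb2*sizeof(float)) and so on, with matching unpin() calls placed just before the free() calls in both the check and non-check branches.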
Example #4
static int
RunTest(int *iparam, double *dparam, real_Double_t *t_) 
{
    double *A = NULL, *AT, *b = NULL, *bT, *x;
    PLASMA_desc        *descA, *descB, *descT;
    real_Double_t       t;
    int nb, nb2, nt;
    int n     = iparam[TIMING_N];
    int nrhs  = iparam[TIMING_NRHS];
    int check = iparam[TIMING_CHECK];
    int lda = n;
    int ldb = n;

    /* Initialize Plasma */ 
    PLASMA_Init( iparam[TIMING_THRDNBR] );
    if ( iparam[TIMING_SCHEDULER] )
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_DYNAMIC_SCHEDULING );
    else
        PLASMA_Set(PLASMA_SCHEDULING_MODE, PLASMA_STATIC_SCHEDULING );
  
#if defined(PLASMA_CUDA)
  core_cublas_init();
#endif

    /*if ( !iparam[TIMING_AUTOTUNING] ) {*/
        PLASMA_Disable(PLASMA_AUTOTUNING);
        PLASMA_Set(PLASMA_TILE_SIZE,        iparam[TIMING_NB] );
        PLASMA_Set(PLASMA_INNER_BLOCK_SIZE, iparam[TIMING_IB] );
    /* } else { */
    /*     PLASMA_Get(PLASMA_TILE_SIZE,        &iparam[TIMING_NB] ); */
    /*     PLASMA_Get(PLASMA_INNER_BLOCK_SIZE, &iparam[TIMING_IB] ); */
    /* }  */
    nb  = iparam[TIMING_NB];
    nb2 = nb * nb;
    nt  = n / nb + ((n % nb == 0) ? 0 : 1);
    
    /* Allocate Data */
    AT  = (double *)malloc(nt*nt*nb2*sizeof(double));

    /* Abort if the allocation failed */
    if ( !AT ){
        printf("Out of Memory\n");
        exit(1);
    }
  
#if defined(PLASMA_CUDA)
    cudaHostRegister((void*)AT, nt*nt*nb2*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Initialize Data */
    PLASMA_Desc_Create(&descA, AT, PlasmaRealDouble, nb, nb, nb*nb, n, n, 0, 0, n, n);
    LAPACKE_dlarnv_work(1, ISEED, nt*nt*nb2, AT);

    /* Allocate Workspace */
    PLASMA_Alloc_Workspace_dgels_Tile(n, n, &descT);
  
#if defined(PLASMA_CUDA)
    cudaHostRegister((void*)descT->mat, descT->lm*descT->ln*sizeof(double), cudaHostRegisterPortable);
#endif

    /* Save AT in lapack layout for check */
    if ( check ) {
        A = (double *)malloc(lda*n    *sizeof(double));
        PLASMA_Tile_to_Lapack(descA, (void*)A, n);
    }

    t = -cWtime();
    PLASMA_dgeqrf_Tile( descA, descT );
    t += cWtime();
    *t_ = t;
    
    /* Check the solution */
    if ( check )
      {
        b  = (double *)malloc(ldb*nrhs *sizeof(double));
        bT = (double *)malloc(nt*nb2   *sizeof(double));
        x  = (double *)malloc(ldb*nrhs *sizeof(double));

        LAPACKE_dlarnv_work(1, ISEED, nt*nb2, bT);
        PLASMA_Desc_Create(&descB, bT, PlasmaRealDouble, nb, nb, nb*nb, n, nrhs, 0, 0, n, nrhs);
        PLASMA_Tile_to_Lapack(descB, (void*)b, n);

        PLASMA_dgeqrs_Tile( descA, descT, descB );

        PLASMA_Tile_to_Lapack(descB, (void*)x, n);

        dparam[TIMING_RES] = d_check_solution(n, n, nrhs, A, lda, b, x, ldb,
                                             &(dparam[TIMING_ANORM]), &(dparam[TIMING_BNORM]), 
                                             &(dparam[TIMING_XNORM]));

        PLASMA_Desc_Destroy(&descB);
        free( A ); 
        free( b ); 
        free( bT ); 
        free( x );
      }

    /* Deallocate Workspace */
    PLASMA_Dealloc_Handle_Tile(&descT);

    PLASMA_Desc_Destroy(&descA);

    free( AT );
    PLASMA_Finalize();
#if defined(PLASMA_CUDA)
#endif
    return 0;
}
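Under PLASMA_CUDA this driver pins AT and the dgels workspace descT->mat but never unregisters them. A sketch of the cleanup ordering, using the same variables; the unregistration has to happen before the corresponding free or dealloc call shown above:

#if defined(PLASMA_CUDA)
    /* Sketch only: unpin before the buffers are released. */
    cudaHostUnregister(descT->mat);   /* before PLASMA_Dealloc_Handle_Tile(&descT) */
    cudaHostUnregister(AT);           /* before free(AT) */
#endif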
Example #5
template <typename DT>
void DataBuffer<DT>::page_lock ()
{ cuda_check (cudaHostRegister ( data_.dptr,  data_.size_d(), cudaHostRegisterPortable));
  cuda_check (cudaHostRegister ( pred_.dptr,  pred_.size_d(), cudaHostRegisterPortable));
  cuda_check (cudaHostRegister (label_.dptr, label_.size_d(), cudaHostRegisterPortable));
}
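A matching page_unlock for the same buffers might look like the sketch below, assuming the data_/pred_/label_ members and cuda_check helper shown above; it is not taken from the original class:

template <typename DT>
void DataBuffer<DT>::page_unlock ()
{ cuda_check (cudaHostUnregister ( data_.dptr));
  cuda_check (cudaHostUnregister ( pred_.dptr));
  cuda_check (cudaHostUnregister (label_.dptr));
}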
Example #6
/// The force exchange is considerably simpler than the atom exchange.
/// In the force case we only need to exchange data that is needed to
/// complete the force calculation.  Since the atoms have not moved we
/// only need to send data from local link cells and we are guaranteed
/// that the same atoms exist in the same order in corresponding halo
/// cells on remote tasks.  The only tricky part is that the size of the
/// plane of local cells that needs to be sent grows in each direction.
/// This is because the y-axis send must send some of the data that was
/// received from the x-axis send, and the z-axis send must send some data
/// from the y-axis send.  This accumulation of data to send is
/// responsible for data reaching neighbor cells that share only edges
/// or corners.
///
/// \see eam.c for an explanation of the requirement to exchange
/// force data.
HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes, int useGPU)
{
   HaloExchange* hh = initHaloExchange(domain);

   if(useGPU){
      hh->loadBuffer = loadForceBuffer;
      hh->unloadBuffer = unloadForceBuffer;
   }else{
      hh->loadBuffer = loadForceBufferCpu;
      hh->unloadBuffer = unloadForceBufferCpu;
   }
   hh->destroy = destroyForceExchange;

   int size0 = (boxes->gridSize[1])*(boxes->gridSize[2]);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   maxSize = MAX(maxSize, size2);
   hh->bufCapacity = (maxSize)*MAXATOMS*sizeof(ForceMsg);

   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // pin memory
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   ForceExchangeParms* parms = (ForceExchangeParms*)comdMalloc(sizeof(ForceExchangeParms));

   parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
   parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
   parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii)
   {
      parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]);
      parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]);

      // copy cell list to gpu
      cudaMalloc((void**)&parms->sendCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMalloc((void**)&parms->recvCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMemcpy(parms->sendCellsGpu[ii], parms->sendCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(parms->recvCellsGpu[ii], parms->recvCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);

      // allocate temp buf
      int size = parms->nCells[ii]+1;
      if (size % 256 != 0) size = ((size + 255)/256)*256;
      cudaMalloc((void**)&parms->natoms_buf[ii], size * sizeof(int));
      cudaMalloc((void**)&parms->partial_sums[ii], (size/256 + 1) * sizeof(int));
   }
   
   hh->hashTable = NULL;
   hh->type = 1;
   hh->parms = parms;
   return hh;
}
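initForceHaloExchange wires hh->destroy to destroyForceExchange, which is not shown here. Purely as a sketch of the symmetric cleanup implied by the allocations above (field names are taken from this constructor; comdFree is assumed to be the counterpart of comdMalloc, and the cell lists from mkForceSendCellList/mkForceRecvCellList are assumed to be comdMalloc'd; this is not the project's actual implementation):

void destroyForceExchangeSketch(HaloExchange* hh)
{
   ForceExchangeParms* parms = (ForceExchangeParms*)hh->parms;

   for (int ii=0; ii<6; ++ii)
   {
      cudaFree(parms->sendCellsGpu[ii]);
      cudaFree(parms->recvCellsGpu[ii]);
      cudaFree(parms->natoms_buf[ii]);
      cudaFree(parms->partial_sums[ii]);
      comdFree(parms->sendCells[ii]);
      comdFree(parms->recvCells[ii]);
   }
   comdFree(parms);

   // unpin before releasing the host buffers
   cudaHostUnregister(hh->sendBufM);
   cudaHostUnregister(hh->sendBufP);
   cudaHostUnregister(hh->recvBufM);
   cudaHostUnregister(hh->recvBufP);
   comdFree(hh->sendBufM);
   comdFree(hh->sendBufP);
   comdFree(hh->recvBufM);
   comdFree(hh->recvBufP);
}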
Example #7
/// \details
/// When called in proper sequence by redistributeAtoms, the atom halo
/// exchange helps serve three purposes:
/// - Send ghost atom data to neighbor tasks.
/// - Shift atom coordinates by the global simulation size when they cross
///   periodic boundaries.  This shift is performed in loadAtomsBuffer.
/// - Transfer ownership of atoms between tasks as the atoms move across
///   spatial domain boundaries.  This transfer of ownership occurs in
///   two places.  The former owner gives up ownership when
///   updateLinkCells moves a formerly local atom into a halo link cell.
///   The new owner accepts ownership when unloadAtomsBuffer calls
///   putAtomInBox to place a received atom into a local link cell.
///
/// This constructor does the following:
///
/// - Sets the bufCapacity to hold the largest possible number of atoms
///   that can be sent across a face.
/// - Initializes function pointers to the atom-specific versions.
/// - Sets the number of link cells to send across each face.
/// - Builds the list of link cells to send across each face.  As
///   explained in the comments for mkAtomCellList, this list must
///   include any link cell, local or halo, that could possibly contain
///   an atom that needs to be sent across the face.  Atoms that need to
///   be sent include "ghost atoms" that are located in local link
///   cells that correspond to halo link cells on receiving tasks as well as
///   formerly local atoms that have just moved into halo link cells and
///   need to be sent to the rank that owns the spatial domain the atom
///   has moved into.
/// - Sets a coordinate shift factor for each face to account for
///   periodic boundary conditions.  For most faces the factor is zero.
///   For faces on the +x, +y, or +z face of the simulation domain
///   the factor is -1.0 (to shift the coordinates by -1 times the
///   simulation domain size).  For -x, -y, and -z faces of the
///   simulation domain, the factor is +1.0.  A sketch of how this shift
///   is applied follows this constructor.
///
/// \see redistributeAtoms
HaloExchange* initAtomHaloExchange(Domain* domain, LinkCell* boxes)
{
   HaloExchange* hh = initHaloExchange(domain);
   
   int size0 = (boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   maxSize = MAX(maxSize, size2);
   hh->bufCapacity = maxSize*2*MAXATOMS*sizeof(AtomMsg);
   
   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // pin memory
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   hh->loadBuffer = loadAtomsBuffer;
   hh->unloadBuffer = unloadAtomsBuffer;
   hh->destroy = destroyAtomsExchange;

   hh->hashTable = initHashTable((boxes->nTotalBoxes - boxes->nLocalBoxes) * MAXATOMS * 2);

   AtomExchangeParms* parms = (AtomExchangeParms*)comdMalloc(sizeof(AtomExchangeParms));

   parms->nCells[HALO_X_MINUS] = 2*(boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
   parms->nCells[HALO_Y_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
   parms->nCells[HALO_Z_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii) {
      parms->cellList[ii] = mkAtomCellList(boxes, (enum HaloFaceOrder)ii, parms->nCells[ii]);
	  
      // copy cell list to gpu
      cudaMalloc((void**)&parms->cellListGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMemcpy(parms->cellListGpu[ii], parms->cellList[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
  
   }
   // allocate scan buf
   int size = boxes->nLocalBoxes+1;
   if (size % 256 != 0) size = ((size + 255)/256)*256;

   int partial_size = size/256 + 1;
   if (partial_size % 256 != 0) partial_size = ((partial_size + 255)/256)*256;

   cudaMalloc((void**)&parms->d_natoms_buf, size * sizeof(int));
   parms->h_natoms_buf = (int*) malloc( size * sizeof(int));
   cudaMalloc((void**)&parms->d_partial_sums, partial_size * sizeof(int));

   for (int ii=0; ii<6; ++ii)
   {
      parms->pbcFactor[ii] = (real_t*)comdMalloc(3*sizeof(real_t));
      for (int jj=0; jj<3; ++jj)
         parms->pbcFactor[ii][jj] = 0.0;
   }
   int* procCoord = domain->procCoord; //alias
   int* procGrid  = domain->procGrid; //alias
   if (procCoord[HALO_X_AXIS] == 0)                       parms->pbcFactor[HALO_X_MINUS][HALO_X_AXIS] = +1.0;
   if (procCoord[HALO_X_AXIS] == procGrid[HALO_X_AXIS]-1) parms->pbcFactor[HALO_X_PLUS][HALO_X_AXIS]  = -1.0;
   if (procCoord[HALO_Y_AXIS] == 0)                       parms->pbcFactor[HALO_Y_MINUS][HALO_Y_AXIS] = +1.0;
   if (procCoord[HALO_Y_AXIS] == procGrid[HALO_Y_AXIS]-1) parms->pbcFactor[HALO_Y_PLUS][HALO_Y_AXIS]  = -1.0;
   if (procCoord[HALO_Z_AXIS] == 0)                       parms->pbcFactor[HALO_Z_MINUS][HALO_Z_AXIS] = +1.0;
   if (procCoord[HALO_Z_AXIS] == procGrid[HALO_Z_AXIS]-1) parms->pbcFactor[HALO_Z_PLUS][HALO_Z_AXIS]  = -1.0;
   
   hh->type = 0;
   hh->parms = parms;
   return hh;
}
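The doc comment above notes that the pbcFactor set here is applied in loadAtomsBuffer by shifting coordinates when atoms cross a periodic boundary. A minimal sketch of that shift; the helper name and the globalExtent parameter are placeholders for illustration, not CoMD's actual identifiers:

/* Shift one atom's coordinates when it is packed for a send across a
 * periodic face.  pbcFactor[axis] is 0, +1, or -1 as set in
 * initAtomHaloExchange above. */
static void applyPbcShift(real_t r[3], const real_t* pbcFactor,
                          const real_t* globalExtent /* placeholder name */)
{
   for (int axis = 0; axis < 3; ++axis)
      r[axis] += pbcFactor[axis] * globalExtent[axis];
}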
Example #8
void cblas_sgemm(const enum CBLAS_ORDER Order, 
                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, 
                 const int M, const int N, const int K, 
                 const float alpha, const float *A, const int lda, 
                 const float *B, const int ldb,
                 const float beta, float *C, const int ldc )
{
    cublasOperation_t transa, transb;
    cublasStatus_t status;
    /*---error handler---*/
    int nrowa, ncola, nrowb, ncolb;
    if (TransA == CblasNoTrans) {
        nrowa = M;
        ncola = K;
    } else {
        nrowa = K;
        ncola = M;
    }
    if (TransB == CblasNoTrans) {
        nrowb = K;
        ncolb = N;
    } else {
        nrowb = N;
        ncolb = K;
    }
    int nrowc = M;
    int ncolc = N;
    int info = 0;
    if (CBLasTransToCuBlasTrans(TransA,&transa) <  0) info = 1;
    else if (CBLasTransToCuBlasTrans(TransB,&transb) < 0) info = 2;
    else if (M < 0) info = 3;
    else if (N < 0) info = 4;
    else if (K < 0) info = 5;
    else if (lda < MAX(1, nrowa)) info = 8;
    else if (ldb < MAX(1, nrowb)) info = 10;
    else if (ldc < MAX(1, M)) info = 13;
    if (info != 0) {
        xerbla_(ERROR_NAME, &info);
        return;
    }
    /*-------------------*/
    
    /*----dispatcher-----*/
    int type = 0; //1:cpu 2:cublasxt 3:blasx
    if (M <= 0 || N <= 0 || K <= 0) type = 1;
    if (type == 0 && (M > 1000 || N > 1000 || K > 1000)) type = 3;
    else                                                 type = 1;
    //Blasx_Debug_Output("type after dispatcher:%d\n",type);
    /*-------------------*/
    

    switch (type) {
        case 1:
        CPU_BLAS:
            Blasx_Debug_Output("calling cblas_sgemm:");
            if (cpublas_handle == NULL) blasx_init(CPU);
            if (cblas_sgemm_p == NULL) blasx_init_cblas_func(&cblas_sgemm_p, "cblas_sgemm");
            (*cblas_sgemm_p)(Order,TransA,TransB,M,N,K,alpha,A,lda,B,ldb,beta,C,ldc);
            break;
        case 2:
            if (cublasXt_handle == NULL) blasx_init(CUBLASXT);
            Blasx_Debug_Output("calling cublasSgemmXt:");
            status = cublasXtSgemm(cublasXt_handle,
                                   transa, transb,
                                   M, N, K,
                                   (float*)&alpha, (float*)A, lda,
                                   (float*)B, ldb,
                                   (float*)&beta, (float*)C, ldc);
            if( status != CUBLAS_STATUS_SUCCESS ) goto CPU_BLAS;
            break;
        case 3:
            Blasx_Debug_Output("calling BLASX:\n");
            cudaHostRegister((void*)A,sizeof(float)*nrowa*ncola,cudaHostRegisterPortable);
            cudaHostRegister((void*)B,sizeof(float)*nrowb*ncolb,cudaHostRegisterPortable);
            cudaHostRegister((void*)C,sizeof(float)*nrowc*ncolc,cudaHostRegisterPortable);
#ifdef BENCHMARK
            double Gflops = FLOPS_DGEMM(M, N, K)/(1000000000);
            double gpu_start, gpu_end;
            gpu_start = get_cur_time();
#endif
            if (is_blasx_enable == 0) blasx_init(BLASX);
            assert( is_blasx_enable == 1 );
            assert( SYS_GPUS > 0 );
            assert( event_SGEMM[0] != NULL );
            assert( C_dev_SGEMM[0] != NULL );
            assert( handles_SGEMM[0] != NULL );
            assert( streams_SGEMM[0] != NULL );
            LRU_t* LRUs[10];
            int GPU_id = 0;
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)    LRUs[GPU_id] = LRU_init( GPU_id );
            blasx_sgemm(SYS_GPUS, handles_SGEMM, LRUs,
                        TransA, TransB,
                        M, N, K, alpha,
                        A, lda,
                        B, ldb,
                        beta,
                        C, ldc);
            for (GPU_id = 0; GPU_id < SYS_GPUS; GPU_id++)    LRU_free( LRUs[GPU_id], GPU_id );
#ifdef BENCHMARK
            gpu_end = get_cur_time();
            printf("BLASX (M:%5d,N:%5d,K:%5d) Speed:%9.1f type:%2d\n", M, N, K, (double)Gflops/(gpu_end - gpu_start), type);
#endif
            cudaHostUnregister((void*)A);
            cudaHostUnregister((void*)B);
            cudaHostUnregister(C);
            break;
        default:
            break;
    }

    //Blasx_Debug_Output("eventually use type:%d to compute\n",type);
}
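The wrapper relies on CBLasTransToCuBlasTrans to map CBLAS transpose flags onto cublasOperation_t, returning a negative value on an invalid flag. That helper is not shown; the following is a sketch consistent with how it is used above, an assumption rather than the BLASX implementation:

#include <cublas_v2.h>

static int CBLasTransToCuBlasTrans(enum CBLAS_TRANSPOSE trans,
                                   cublasOperation_t *cuTrans)
{
    switch (trans) {
        case CblasNoTrans:   *cuTrans = CUBLAS_OP_N; return 0;
        case CblasTrans:     *cuTrans = CUBLAS_OP_T; return 0;
        case CblasConjTrans: *cuTrans = CUBLAS_OP_C; return 0;
        default:             return -1;  /* rejected by the error handler */
    }
}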
Example #9
int dfft_cuda_create_plan(dfft_plan *p,
    int ndim, int *gdim,
    int *inembed, int *oembed,
    int *pdim, int *pidx, int row_m,
    int input_cyclic, int output_cyclic,
    MPI_Comm comm,
    int *proc_map)
    {
    int res = dfft_create_plan_common(p, ndim, gdim, inembed, oembed,
        pdim, pidx, row_m, input_cyclic, output_cyclic, comm, proc_map, 1);

    #ifndef ENABLE_MPI_CUDA
    /* allocate staging bufs */
    /* we need to use posix_memalign/cudaHostRegister instead
     * of cudaHostAlloc, because cudaHostAlloc doesn't have hooks
     * in the MPI library, and using it would lead to data corruption
     */
    int size = p->scratch_size*sizeof(cuda_cpx_t);
    int page_size = getpagesize();
    size = ((size + page_size - 1) / page_size) * page_size;
    posix_memalign((void **)&(p->h_stage_in),page_size,size);
    posix_memalign((void **)&(p->h_stage_out),page_size,size);
    cudaHostRegister(p->h_stage_in, size, cudaHostRegisterDefault);
    CHECK_CUDA();
    cudaHostRegister(p->h_stage_out, size, cudaHostRegisterDefault);
    CHECK_CUDA();
    #endif

    /* allocate memory for passing variables */
    cudaMalloc((void **)&(p->d_pidx), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_pdim), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_iembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_oembed), sizeof(int)*ndim);
    CHECK_CUDA();
    cudaMalloc((void **)&(p->d_length), sizeof(int)*ndim);
    CHECK_CUDA();

    /* initialize cuda buffers */
    int *h_length = (int *)malloc(sizeof(int)*ndim);
    int i;
    for (i = 0; i < ndim; ++i)
        h_length[i] = gdim[i]/pdim[i];
    cudaMemcpy(p->d_pidx, pidx, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_pdim, pdim, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_iembed, p->inembed, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_oembed, p->oembed, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    cudaMemcpy(p->d_length, h_length, sizeof(int)*ndim, cudaMemcpyDefault); 
    CHECK_CUDA();
    free(h_length);

    int dmax = p->max_depth + 2;
    p->d_rev_j1 = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_global = (int **) malloc(sizeof(int *)*dmax);
    p->d_rev_partial = (int **) malloc(sizeof(int *)*dmax);
    p->d_c0 = (int **) malloc(sizeof(int *)*dmax);
    p->d_c1 = (int **) malloc(sizeof(int *)*dmax);
    if (p->max_depth)
        {
        p->h_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        p->d_alpha = (cuda_scalar_t **) malloc(sizeof(cuda_scalar_t *)*p->max_depth);
        }

    int d;
    for (d = 0; d < dmax; ++d)
        {
        cudaMalloc((void **)&(p->d_rev_j1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_partial[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_rev_global[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c0[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        cudaMalloc((void **)&(p->d_c1[d]), sizeof(int)*ndim);
        CHECK_CUDA();
        }

    for (d = 0; d < p->max_depth; ++d)
        {
        cudaMalloc((void **)&(p->d_alpha[d]), sizeof(cuda_scalar_t)*ndim); 
        CHECK_CUDA();
        p->h_alpha[d] = (cuda_scalar_t *) malloc(sizeof(cuda_scalar_t)*ndim);
        }

    /* perform initialization run */
    dfft_cuda_execute(NULL, NULL, 0, p);

    /* initialization finished */
    p->init = 0;

    return res;
    } 
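The comment at the top of this plan constructor explains why the staging buffers are allocated with posix_memalign and pinned with cudaHostRegister rather than with cudaHostAlloc. The same pattern can be factored into a small helper; this sketch follows that comment's reasoning, adds the error checking the original omits, and uses a helper name that is not from dfftlib:

#include <unistd.h>        /* getpagesize */
#include <stdlib.h>        /* posix_memalign, free */
#include <cuda_runtime.h>

/* Allocate a page-aligned host buffer of at least nbytes and page-lock it.
 * Returns NULL on failure.  The caller later releases it with
 * cudaHostUnregister(ptr); free(ptr); */
static void *alloc_pinned_staging(size_t nbytes)
{
    void *ptr = NULL;
    size_t page_size = (size_t)getpagesize();
    size_t size = ((nbytes + page_size - 1) / page_size) * page_size;

    if (posix_memalign(&ptr, page_size, size) != 0)
        return NULL;
    if (cudaHostRegister(ptr, size, cudaHostRegisterDefault) != cudaSuccess) {
        free(ptr);
        return NULL;
    }
    return ptr;
}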
Example #10
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  nvtxRangeId_t r1=nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostRegister(&p.coefs[0],ncols*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostRegister(&A.send_buffer[0],A.send_buffer.size()*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
#endif
#endif

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);

    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    //TODO remove false below
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!"<<std::endl;
#endif
        //update the timers before jumping out.
        my_cg_times[WAXPY] = tWAXPY;
        my_cg_times[DOT] = tDOT;
        my_cg_times[MATVEC] = tMATVEC;
        my_cg_times[TOTAL] = mytimer() - total_time;
        return;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    TICK(); waxpby(one, x, alpha, p, x);
            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);
    num_iters = k;
  }
  
#ifdef HAVE_MPI
#ifndef GPUDIRECT
  //TODO move outside?
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
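cg_solve above checks the host registrations with a cudaCheckError() helper whose definition is not included here. A typical definition consistent with that usage follows; it is an assumption, not necessarily miniFE's actual macro:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Report and abort on the most recent CUDA runtime error, if any.
#define cudaCheckError()                                                    \
  do {                                                                      \
    cudaError_t e = cudaGetLastError();                                     \
    if (e != cudaSuccess) {                                                 \
      std::fprintf(stderr, "CUDA error %s:%d: %s\n",                        \
                   __FILE__, __LINE__, cudaGetErrorString(e));              \
      std::exit(EXIT_FAILURE);                                              \
    }                                                                       \
  } while (0)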
HostReflectionHost::BootUp::BootUp(const std::string& module)
    : _module(module)
{
    report("Booting up host reflection...");

    // add message handlers
    _addMessageHandlers();

    // allocate memory for the queue
    size_t queueDataSize = maxMessageSize() * 2;
    size_t size = 2 * (queueDataSize + sizeof(QueueMetaData));

    _deviceHostSharedMemory = new char[size];

    // setup the queue meta data
    QueueMetaData* hostToDeviceMetaData =
        (QueueMetaData*)_deviceHostSharedMemory;
    QueueMetaData* deviceToHostMetaData =
        (QueueMetaData*)_deviceHostSharedMemory + 1;

    char* hostToDeviceData = _deviceHostSharedMemory +
                             2 * sizeof(QueueMetaData);
    char* deviceToHostData = _deviceHostSharedMemory +
                             2 * sizeof(QueueMetaData) + queueDataSize;

    hostToDeviceMetaData->hostBegin = hostToDeviceData;
    hostToDeviceMetaData->size      = queueDataSize;
    hostToDeviceMetaData->head      = 0;
    hostToDeviceMetaData->tail      = 0;
    hostToDeviceMetaData->mutex     = (size_t)-1;

    deviceToHostMetaData->hostBegin = deviceToHostData;
    deviceToHostMetaData->size      = queueDataSize;
    deviceToHostMetaData->head      = 0;
    deviceToHostMetaData->tail      = 0;
    deviceToHostMetaData->mutex     = (size_t)-1;

    // Allocate the queues
    _hostToDeviceQueue = new HostQueue(hostToDeviceMetaData);
    _deviceToHostQueue = new HostQueue(deviceToHostMetaData);

    // Map the memory onto the device.  cudaHostRegisterMapped is what lets
    // cudaHostGetDevicePointer() below return a device-side alias (with a
    // flag of 0 this only works on systems with unified virtual addressing).
    cudaHostRegister(_deviceHostSharedMemory, size, cudaHostRegisterMapped);

    char* devicePointer = 0;

    cudaHostGetDevicePointer((void**)&devicePointer,
                             _deviceHostSharedMemory, 0);

    // Send the metadata to the device
    QueueMetaData* hostToDeviceMetaDataPointer =
        (QueueMetaData*)devicePointer;
    QueueMetaData* deviceToHostMetaDataPointer =
        (QueueMetaData*)devicePointer + 1;

    hostToDeviceMetaData->deviceBegin = devicePointer +
                                        2 * sizeof(QueueMetaData);
    deviceToHostMetaData->deviceBegin = devicePointer +
                                        2 * sizeof(QueueMetaData) + queueDataSize;

    // Legacy (pre-CUDA 7) launch sequence: configure the grid, push the two
    // pointer arguments (8 bytes each on a 64-bit host) by size and offset,
    // then launch the boot-up kernel through Ocelot.
    cudaConfigureCall(dim3(1, 1, 1), dim3(1, 1, 1), 0, 0);

    cudaSetupArgument(&hostToDeviceMetaDataPointer, 8, 0 );
    cudaSetupArgument(&deviceToHostMetaDataPointer, 8, 8 );
    ocelot::launch(_module, "_bootupHostReflection");

    // start up the host worker thread
    _kill   = false;
    _thread = new boost::thread(_runThread, this);
}