Example No. 1
int TEMPLATE2 (CHOLMOD (gpu_updateC))
(
    Int ndrow1,         /* C is ndrow2-by-ndrow2 */
    Int ndrow2,
    Int ndrow,          /* leading dimension of Lx */
    Int ndcol,          /* L1 is ndrow1-by-ndcol */
    Int nsrow,
    Int pdx1,           /* L1 starts at Lx + L_ENTRY*pdx1 */
    /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */
    Int pdi1,
    double *Lx,
    double *C,
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrLx, *devPtrC ;
    double alpha, beta ;
    cublasStatus_t cublasStatus ;
    cudaError_t cudaStat [2] ;
    Int ndrow3 ;
    int icol, irow;
    int iHostBuff, iDevBuff ;

#ifndef NTIMER
    double tstart = 0;
#endif

    if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) ||
        (ndcol*L_ENTRY <  CHOLMOD_ND_COL_LIMIT))
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

    ndrow3 = ndrow2 - ndrow1 ;

#ifndef NTIMER
    Common->syrkStart = SuiteSparse_time ( ) ;
    Common->CHOLMOD_GPU_SYRK_CALLS++ ;
#endif

    /* ---------------------------------------------------------------------- */
    /* allocate workspace on the GPU */
    /* ---------------------------------------------------------------------- */

    iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
    iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

    /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS,
       usually 2, so we can overlap the copy of this descendant supernode
       with the compute of the previous descendant supernode */
    devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]);
    /* there is very little overlap between kernels for different descendant
       supernodes (since we require the supernodes to be large enough to fill
       the device), so we only need one C buffer */
    devPtrC = (double *)(gpu_p->d_C);

    /* ---------------------------------------------------------------------- */
    /* copy Lx to the GPU */
    /* ---------------------------------------------------------------------- */

    /* copy host data to pinned buffer first for better H2D bandwidth */
#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32)
    for ( icol=0; icol<ndcol; icol++ ) {
        for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) {
            gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] =
                Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow];
        }
    }

    cudaStat[0] = cudaMemcpyAsync ( devPtrLx,
        gpu_p->h_Lx[iHostBuff],
        ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]),
        cudaMemcpyHostToDevice,
        Common->gpuStream[iDevBuff] );

    if ( cudaStat[0] ) {
        CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0]));
        return (0);
    }

    /* make the current stream wait for kernels in previous streams */
    cudaStreamWaitEvent ( Common->gpuStream[iDevBuff],
                          Common->updateCKernelsComplete, 0 ) ;

    /* ---------------------------------------------------------------------- */
    /* create the relative map for this descendant supernode */
    /* ---------------------------------------------------------------------- */

    createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map),
                                (Int *)(gpu_p->d_Ls),
                                (Int *)(gpu_p->d_RelativeMap),
                                pdi1, ndrow2,
                                &(Common->gpuStream[iDevBuff]) );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA SYRK */
    /* ---------------------------------------------------------------------- */

    cublasStatus = cublasSetStream (Common->cublasHandle,
                                    Common->gpuStream[iDevBuff]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ;
    }

    alpha  = 1.0 ;
    beta   = 0.0 ;

#ifdef REAL
    cublasStatus = cublasDsyrk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol */
        &alpha,         /* ALPHA:  1 */
        devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA:   0 */
        devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#else
    cublasStatus = cublasZherk (Common->cublasHandle,
        CUBLAS_FILL_MODE_LOWER,
        CUBLAS_OP_N,
        (int) ndrow1,
        (int) ndcol,    /* N, K: L1 is ndrow1-by-ndcol*/
        &alpha,         /* ALPHA:  1 */
        (const cuDoubleComplex *) devPtrLx,
        ndrow2,         /* A, LDA: L1, ndrow2 */
        &beta,          /* BETA:   0 */
        (cuDoubleComplex *) devPtrC,
        ndrow2) ;       /* C, LDC: C1 */
#endif

    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart;
#endif

    /* ---------------------------------------------------------------------- */
    /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1'    */
    /* ---------------------------------------------------------------------- */

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_CALLS++ ;
    tstart = SuiteSparse_time();
#endif

    if (ndrow3 > 0)
    {
#ifndef REAL
        cuDoubleComplex calpha  = {1.0,0.0} ;
        cuDoubleComplex cbeta   = {0.0,0.0} ;
#endif

        /* ------------------------------------------------------------------ */
        /* do the CUDA BLAS dgemm */
        /* ------------------------------------------------------------------ */

#ifdef REAL
        alpha  = 1.0 ;
        beta   = 0.0 ;
        cublasStatus = cublasDgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_T,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &alpha,                         /* ALPHA:  1 */
            devPtrLx + L_ENTRY*(ndrow1),    /* A, LDA: L2*/
            ndrow2,                         /* ndrow */
            devPtrLx,                       /* B, LDB: L1 */
            ndrow2,                         /* ndrow */
            &beta,                          /* BETA:   0 */
            devPtrC + L_ENTRY*ndrow1,       /* C, LDC: C2 */
            ndrow2) ;
#else
        cublasStatus = cublasZgemm (Common->cublasHandle,
            CUBLAS_OP_N, CUBLAS_OP_C,
            ndrow3, ndrow1, ndcol,          /* M, N, K */
            &calpha,                        /* ALPHA:  1 */
            (const cuDoubleComplex*) devPtrLx + ndrow1,
            ndrow2,                         /* ndrow */
            (const cuDoubleComplex *) devPtrLx,
            ndrow2,                         /* ndrow */
            &cbeta,                         /* BETA:   0 */
            (cuDoubleComplex *)devPtrC + ndrow1,
            ndrow2) ;
#endif

        if (cublasStatus != CUBLAS_STATUS_SUCCESS)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ;
        }

    }

#ifndef NTIMER
    Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart;
#endif

    /* ------------------------------------------------------------------ */
    /* Assemble the update C on the device using the d_RelativeMap */
    /* ------------------------------------------------------------------ */

#ifdef REAL
    addUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#else
    addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC,
        gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow,
        &(Common->gpuStream[iDevBuff]) );
#endif

    /* Record an event indicating that kernels for
       this descendant are complete */
    cudaEventRecord ( Common->updateCKernelsComplete,
                      Common->gpuStream[iDevBuff]);
    cudaEventRecord ( Common->updateCBuffersFree[iHostBuff],
                      Common->gpuStream[iDevBuff]);

    return (1) ;
}
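
Example No. 1 gets copy/compute overlap by cycling pinned host buffers and device streams: the OpenMP loop packs Lx into h_Lx[iHostBuff], cudaMemcpyAsync uploads it on gpuStream[iDevBuff], and events order the kernels behind the copies. Below is a minimal sketch of that double-buffering pattern under simplified assumptions; pack_chunk() and process_kernel() are hypothetical stand-ins for the pack loop and the cuBLAS calls.

/* Sketch: double-buffered pinned staging for async H2D copies.
 * pack_chunk() and process_kernel() are hypothetical. */
extern void pack_chunk(float *dst, int chunk_id);   /* CPU-side pack into pinned memory */
__global__ void process_kernel(float *buf);         /* GPU-side work on one chunk */

static void pipelined_upload(size_t chunk_bytes, int nchunks, dim3 grid, dim3 block)
{
    enum { NBUF = 2 };
    float       *stage[NBUF];     /* pinned host staging buffers */
    float       *d_buf[NBUF];     /* device buffers */
    cudaStream_t stream[NBUF];

    for (int i = 0; i < NBUF; i++) {
        cudaMallocHost((void **)&stage[i], chunk_bytes);   /* pinned => true async H2D */
        cudaMalloc((void **)&d_buf[i], chunk_bytes);
        cudaStreamCreate(&stream[i]);
    }

    for (int k = 0; k < nchunks; k++) {
        int b = k % NBUF;
        cudaStreamSynchronize(stream[b]);            /* buffer b is no longer in flight */
        pack_chunk(stage[b], k);                     /* overlaps with the other stream's work */
        cudaMemcpyAsync(d_buf[b], stage[b], chunk_bytes,
                        cudaMemcpyHostToDevice, stream[b]);
        process_kernel<<<grid, block, 0, stream[b]>>>(d_buf[b]);
    }

    for (int i = 0; i < NBUF; i++) {
        cudaStreamSynchronize(stream[i]);
        cudaStreamDestroy(stream[i]);
        cudaFree(d_buf[i]);
        cudaFreeHost(stage[i]);
    }
}
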
Example No. 2
void TEMPLATE2 (CHOLMOD (gpu_final_assembly))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nsrow,
    int supernodeUsedGPU,
    int *iHostBuff,
    int *iDevBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    Int iHostBuff2 ;
    Int iDevBuff2 ;

    if ( supernodeUsedGPU ) {

        /* ------------------------------------------------------------------ */
        /* Apply all of the Schur-complement updates, computed on the GPU, */
        /* to the supernode. */
        /* ------------------------------------------------------------------ */

        *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* If this supernode is going to be factored using the GPU (potrf),
             * then it will need the portion of the update assembled on the
             * CPU.  So copy that to a pinned buffer for an H2D copy to the
             * device. */

            /* wait until a buffer is free */
            cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] );

            /* copy update assembled on CPU to a pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx];
                }
            }

            /* H2D transfer of update assembled on CPU */
            cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff],
                              nscol*nsrow*L_ENTRY*sizeof(double),
                              cudaMemcpyHostToDevice,
                              Common->gpuStream[*iDevBuff] );
        }

        Common->ibuffer++;

        iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        /* wait for all kernels to complete */
        cudaEventSynchronize( Common->updateCKernelsComplete );

        /* copy assembled Schur-complement updates computed on GPU */
        cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0],
                          nscol*nsrow*L_ENTRY*sizeof(double),
                          cudaMemcpyDeviceToHost,
                          Common->gpuStream[iDevBuff2] );

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* with the current implementation, potrf still uses data from the
             * CPU - so put the fully assembled supernode in a pinned buffer for
             * fastest access */

            /* need both H2D and D2H copies to be complete */
            cudaDeviceSynchronize();

            /* sum updates from cpu and device on device */
#ifdef REAL
            sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol );
#else
            sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0],
                                  -1.0, nsrow, nscol );
#endif

            /* place final assembled supernode in pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] -=
                        gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }

        }
        else
        {

            /* assemble with CPU updates */
            cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS)   \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }
        }
    }
    return;
}
Example No. 3
void GpuDevice::DoCopyRemoteData(float* dst, float* src, size_t size, int thrid) {
  CUDA_CALL(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, impl_->stream[thrid]));
  CUDA_CALL(cudaStreamSynchronize(impl_->stream[thrid]));
}
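
Example No. 3 leans on cudaMemcpyDefault, so the runtime infers the transfer direction from unified virtual addressing, and the immediate cudaStreamSynchronize makes the call blocking. A minimal sketch of the cross-GPU case this pattern covers, under the assumption that both devices support unified addressing (function and variable names are illustrative only):

/* Sketch: cross-GPU copy with cudaMemcpyDefault. Enabling peer access
 * (when supported) gives a direct GPU-to-GPU path; otherwise the runtime
 * stages the copy through host memory. */
static void copy_gpu0_to_gpu1(float *d_dst_on_gpu1, const float *d_src_on_gpu0,
                              size_t bytes, cudaStream_t stream_on_gpu0)
{
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, 0, 1);   /* can GPU 0 address GPU 1? */

    cudaSetDevice(0);
    if (can_access)
        cudaDeviceEnablePeerAccess(1, 0);         /* returns an error code if already
                                                     enabled; harmless in this sketch */

    cudaMemcpyAsync(d_dst_on_gpu1, d_src_on_gpu0, bytes,
                    cudaMemcpyDefault, stream_on_gpu0);
    cudaStreamSynchronize(stream_on_gpu0);        /* same blocking behaviour as above */
}
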
Example No. 4
int main( int argc, char*argv[] )
{
  float *p_old,*p_new,*p_tmp;
  int    n,nn;
  float  gosa,gflops,thruput,thruput2;
  double time_start,time_max,target,bytes;
  cudaStream_t stream_top,stream_btm;

  NP=1;
  gpu=0;
  ME=0;

  target= 60.0;
  omega= 0.8f;
  imax = MIMAX-1;
  jmax = MJMAX-1;
  kmax = MKMAX-1;
  imax_global = NP*(imax-2)+2;
  nn = ITERS;

  if(ME==0)
  {
    printf("\n mimax = %d mjmax = %d mkmax = %d pitch = %d\n",MIMAX, MJMAX, MKMAX, PITCH);
    printf(" imax = %d jmax = %d kmax = %d\n",imax_global,jmax,kmax);
    printf(" gridX = %d  gridY = %d  blockX = %d  blockY = %d\n", GRID_X, GRID_Y, BLOCK_X, BLOCK_Y);
  }
  //printf("There are %d processes, I am process# %d using GPU %d\n",NP,ME,gpu);
  
  CUDA_SAFE_CALL(cudaSetDevice(gpu));
  stream_top = 0; 
  stream_btm = 0;

#if (CUDART_VERSION >= 3000)
  {
#if (CUDART_VERSION > 3000)
      struct cudaDeviceProp prop;
      // display ECC configuration, only queryable post r3.0
      CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, gpu));
      printf (" ECC on GPU %d is %s\n", gpu, prop.ECCEnabled ? "ON" : "OFF");
#endif /* CUDART_VERSION > 3000 */
      // configure kernels for large shared memory to get better occupancy
      printf (" Configuring GPU L1 cache size ...\n");
      set_kernel_cache_config (cudaFuncCachePreferShared);
  }
#endif /* CUDART_VERSION >= 3000 */

  CUDA_SAFE_CALL(cudaStreamCreate(&stream_top));
  CUDA_SAFE_CALL(cudaStreamCreate(&stream_btm));

  if(ME==0) printf(" Allocating Memory...\n");
  allocate_memory();
  if(ME==0) printf(" Initializing Data...\n\n");
  initmt();
  
  if(ME==0)
  {
    printf(" Now, start GPU measurement process.\n");
    printf(" The loop will be excuted %d times\n",nn);
    printf(" Wait for a while\n\n");
  }

  time_start = wallclock();

  gosa = 0.0f;
  p_new = p2_d; p_old = p1_d;
  
  for(n=0 ; n<nn; n++)
  {
    //swap pointers
    p_tmp = p_new; p_new = p_old; p_old = p_tmp;
    jacobi_GPU_btm_even (stream_btm,a0_d,a1_d,a2_d,a3_d,b0_d,b1_d,b2_d,c0_d,
                         c1_d,c2_d,wrk_d,bnd_d,p_old,p_new,gosa_d,omega,n);

    cudaMemcpyAsync (gosa_btm, gosa_d, sizeof(float), cudaMemcpyDeviceToHost,
                     stream_btm);
    // Since we want to print intermediate values of gosa every PRINT_ITER
    // iterations, we need to synchronize before picking up the asynchronously 
    // updated value.
    if (!(n % PRINT_ITER)) {
        cudaStreamSynchronize(stream_btm);
        gosa = *gosa_btm;
    }
    if(ME==0 && n%PRINT_ITER==0) printf(" iter: %d \tgosa: %e\n",n,gosa);
  }

  cudaThreadSynchronize();
  gosa = *gosa_btm;
  time_max = wallclock() - time_start;

  gflops   = (float)(34.0*( (double)nn*(double)(imax_global-2)*(double)(jmax-2)*(double)(kmax-2) ) / time_max * 1e-9);
  bytes    = NP*((double)nn*(56.0*(imax-2)+8.0)*(double)(jmax)*(double)(kmax));
  thruput  = (float)(bytes / time_max / 1024.0 / 1024.0 / 1024.0);
  thruput2 = (float)(bytes / time_max / 1e9);

  if(ME==0)
  {
    printf(" \nLoop executed for %d times\n",nn);
    printf(" Gosa : %e \n",gosa);
    printf(" total Compute   : %4.1f GFLOPS\ttime : %f seconds\n",gflops,time_max);
    printf(" total Bandwidth : %4.1f GB/s\n", thruput);
    printf(" total Bandwidth : %4.1f GB/s (STREAM equivalent)\n",thruput2);
    printf(" Score based on Pentium III 600MHz : %f\n\n",1000.0*gflops/82.0);
  }
  cleanup();

  CUDA_SAFE_CALL(cudaStreamDestroy(stream_top));
  CUDA_SAFE_CALL(cudaStreamDestroy(stream_btm));

  //check_results();
  return (EXIT_SUCCESS);
}
Example No. 5
/* created thread; all these calls run in the thread context */
void *upro_gpu_worker_main(upro_gpu_worker_context_t *context)
{
	upro_timer_t t, loopcounter;
	upro_log_t log;
	int i, id = 0, start = 0;
	double elapsed_time;
	upro_batch_buf_t *buf;

	assert(config->cpu_worker_num <= 16);
	cudaStream_t stream[MAX_GPU_STREAM];
	for (i = 0; i < MAX_GPU_STREAM; i ++) {
		cudaStreamCreate(&stream[i]);
	}

	/* Init timers */
	upro_timer_init(&t); // For separate events
	upro_timer_init(&counter); // For the whole program
	upro_timer_init(&loopcounter); // For each loop
	upro_log_init(&log);

	/* Initialize the GPU worker: wait until all CPU workers have been
	 * initialized, then init the GPU worker with the CPU workers' batches */
	upro_gpu_worker_t g;
	upro_gpu_worker_init(&g, context);

	printf("GPU Worker is working on core %d ...\n", context->core_id);

	/* Timers for each kernel launch */
	upro_timer_restart(&loopcounter);
	
	i = 0;
	for (;;) {
		upro_log_loop_marker(&log);

		i ++;
		if (upro_unlikely(i == 300)) {
			printf("-------------------------------------------------------------------------------------------------\n");
			upro_timer_restart(&counter);
			total_packets = 0;
		}

		//////////////////////////////////////////
		/* This is a CPU/GPU synchronization point */
		do {
			elapsed_time = upro_timer_get_elapsed_time(&loopcounter);
			if (elapsed_time - config->I > 0.01) { // surpassed the time point more than 1 ms
				upro_log_msg(&log, "\n%s %lf\n", "--- [GPU Worker] Time point lost! : ", elapsed_time);
				// assert(0);
			}
		} while ((double)(config->I) - elapsed_time > 0.01);

		upro_log_msg(&log, "%s %lf\n", "--- [GPU Worker] Time point arrived : ", elapsed_time);
		upro_timer_restart(&loopcounter);
		//////////////////////////////////////////

		/* Get Input Buffer from CPU Workers */
		upro_gpu_get_batch(&g, context->cpu_batch_set);
		upro_timer_restart(&t);

		for (id = 0; id < config->cpu_worker_num; id ++) {
			buf = g.bufs[g.cur_buf_id][id];

			total_packets += buf->job_num;
#if defined(NOT_GPU)
			printf("%d,", buf->job_num);
			continue;
#endif

			if (buf->job_num == 0) {
				printf("%d,", buf->job_num);
				continue;
			} else {
				printf("%d,", buf->job_num);
				/*
				if (upro_unlikely(start == 0))	{
					upro_timer_restart(&counter);
					start = 1;
				}*/
			}

			// FOR DEBUG 
			/*
			{
				int j;
				for (j = 0; j < buf->job_num; j ++) {
					assert(((uint16_t *)(buf->length_pos))[j] == 1328);
					assert(((uint32_t *)(buf->pkt_offset_pos))[j] == 1344 * j);
					uint64_t a = *(uint32_t *) ((uint8_t *)buf->input_buf + 1344 * j);
					assert(a == 0x01006080);
				}
			}
			*/

#if defined(TRANSFER_SEPERATE)
			cudaMemcpyAsync(buf->input_buf_d, buf->input_buf, buf->buf_length, cudaMemcpyHostToDevice, stream[id]);
			cudaMemcpyAsync(buf->aes_key_pos_d, buf->aes_key_pos, AES_KEY_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]);
			cudaMemcpyAsync(buf->aes_iv_pos_d, buf->aes_iv_pos, AES_IV_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]);
			cudaMemcpyAsync(buf->pkt_offset_pos_d, buf->pkt_offset_pos, PKT_OFFSET_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]);
			cudaMemcpyAsync(buf->length_pos_d, buf->length_pos, PKT_LENGTH_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]);
			cudaMemcpyAsync(buf->hmac_key_pos_d, buf->hmac_key_pos, HMAC_KEY_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]);
#else
			cudaMemcpyAsync(buf->input_buf_d, buf->input_buf, alloc_size, cudaMemcpyHostToDevice, stream[id]);
#endif

			co_aes_sha1_gpu (
				buf->input_buf_d,
				buf->input_buf_d, // output_buf = input_buf, we do not allocate output now
				buf->aes_key_pos_d,
				buf->aes_iv_pos_d,
				buf->hmac_key_pos_d,
				buf->pkt_offset_pos_d,
				buf->length_pos_d,
				buf->job_num,
				NULL,
				256, // the library requires the T-box to be initialized
				stream[id]);

			cudaMemcpyAsync(buf->input_buf, buf->input_buf_d, buf->buf_length, cudaMemcpyDeviceToHost, stream[id]);
		}

		cudaDeviceSynchronize();

		upro_timer_stop(&t);
		upro_log_msg(&log, "\n%s %lf ms\n", "--- [GPU Worker] Execution Time :", upro_timer_get_total_time(&t));

		/* Tell the forwarders that this batch has been processed */
		upro_gpu_give_to_forwarder(&g, context->cpu_batch_set);
	}

	upro_timer_stop(&counter);
	printf("End of execution, now the program costs : %f ms\n", upro_timer_get_total_time(&counter));
	// printf("Processing speed is %.2f Mbps\n", (bytes * 8) / (1e3 * upro_timer_get_total_time(&counter)));
	// upro_log_print(&log);

	return 0;
}
Example No. 6
cudaError_t WINAPI wine_cudaMemcpyAsync( void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream ) {
    WINE_TRACE("\n");
    return cudaMemcpyAsync( dst, src, count, kind, stream );
}
Example No. 7
gaspi_return_t
pgaspi_gpu_write_notify(const gaspi_segment_id_t segment_id_local,
			const gaspi_offset_t offset_local,
			const gaspi_rank_t rank,
			const gaspi_segment_id_t segment_id_remote,
			const gaspi_offset_t offset_remote,
			const gaspi_size_t size,
			const gaspi_notification_id_t notification_id,
			const gaspi_notification_t notification_value,
			const gaspi_queue_id_t queue,
			const gaspi_timeout_t timeout_ms)
{

  if(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
     size <= GASPI_GPU_DIRECT_MAX )
    {
      return gaspi_write_notify(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size,notification_id, notification_value, queue, timeout_ms);
    }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char *host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr+NOTIFY_OFFSET+offset_local);
  char* device_ptr =(char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr+offset_local);

  gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
    {
      gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  int copy_size = 0;
  int gpu_offset = 0;
  int size_left = size;
  int BLOCK_SIZE= GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
    {
      int i;
      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  if(size_left > BLOCK_SIZE)
	    copy_size = BLOCK_SIZE;
	  else
	    copy_size = size_left;

	  if(cudaMemcpyAsync(host_ptr+gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue]))
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  glb_gaspi_ctx.ne_count_c[queue]++;

	  agpu->events[queue][i].segment_remote = segment_id_remote;
	  agpu->events[queue][i].segment_local = segment_id_local;
	  agpu->events[queue][i].size = copy_size;
	  agpu->events[queue][i].rank = rank;
	  agpu->events[queue][i].offset_local = offset_local+gpu_offset;
	  agpu->events[queue][i].offset_remote = offset_remote+gpu_offset;
	  agpu->events[queue][i].in_use  = 1;
	  cudaError_t err = cudaEventRecord(agpu->events[queue][i].event,agpu->streams[queue]);
	  if(err != cudaSuccess)
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }
	  /* That's not beautiful at all; however, otherwise we would soon overflow the queue */
	  if(agpu->events[queue][i].ib_use)
	    {
	      struct ibv_wc wc;
	      int ne;
	      do
		{
		  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
		  glb_gaspi_ctx.ne_count_c[queue] -= ne;
		  if (ne == 0)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }

		} while(ne == 0);
	      agpu->events[queue][i].ib_use = 0;
	    }

	  gpu_offset += copy_size;
	  size_left -= copy_size;
	  if(size_left == 0)
	    break;
	}

      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  cudaError_t error;
	  if (agpu->events[queue][i].in_use == 1 )
	    {
	      do
		{
		  error = cudaEventQuery(agpu->events[queue][i].event );
		  if( cudaSuccess == error )
		    {
		      if (_gaspi_event_send(&agpu->events[queue][i],queue) )
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_ERROR;
			}

		      agpu->events[queue][i].in_use  = 0;
		    }
		  else if(error == cudaErrorNotReady)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		  else
		    {
		      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
		      return GASPI_ERROR;
		    }
		} while(error != cudaSuccess);
	    }
	}
    }

  struct ibv_send_wr *bad_wr;
  struct ibv_sge slistN;
  struct ibv_send_wr swrN;

  slistN.addr = (uintptr_t)(glb_gaspi_ctx.nsrc.buf + notification_id * sizeof(gaspi_notification_id_t));

  *((unsigned int *) slistN.addr) = notification_value;

  slistN.length = sizeof(gaspi_notification_id_t);
  slistN.lkey =((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey;

  if((glb_gaspi_ctx.rrmd[segment_id_remote][rank].cudaDevId >= 0))
    {
      swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_addr + notification_id * sizeof(gaspi_notification_id_t));
      swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_rkey;
    }
  else
    {
      swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].addr + notification_id * sizeof(gaspi_notification_id_t));
      swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].rkey;
    }

  swrN.sg_list = &slistN;
  swrN.num_sge = 1;
  swrN.wr_id = rank;
  swrN.opcode = IBV_WR_RDMA_WRITE;
  swrN.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
  swrN.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpC[queue][rank], &swrN, &bad_wr))
    {
      glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
      return GASPI_ERROR;
    }

  glb_gaspi_ctx.ne_count_c[queue]++;

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);

  return GASPI_SUCCESS;
}
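
The routine above streams a large device segment to the host in GASPI_GPU_BUFFERED-sized blocks, recording a CUDA event per block so the InfiniBand send can begin as soon as that block is resident in pinned host memory (the same pattern reappears in pgaspi_gpu_write below). A minimal sketch of the chunked copy, with hypothetical names and the event bookkeeping reduced to a comment:

/* Sketch: chunked device-to-host copy with one CUDA event per in-flight
 * chunk, so a consumer (here, the IB send) can start as soon as a chunk
 * has landed in the pinned host buffer. host_buf must be pinned; the
 * events are assumed to have been created with cudaEventCreate. */
#define NCHUNK_EVENTS 4

static void chunked_d2h(char *host_buf, const char *dev_buf, size_t total,
                        size_t block, cudaStream_t stream,
                        cudaEvent_t ev[NCHUNK_EVENTS])
{
    size_t off = 0;
    int i = 0;

    while (off < total) {
        size_t chunk = (total - off > block) ? block : (total - off);
        cudaMemcpyAsync(host_buf + off, dev_buf + off, chunk,
                        cudaMemcpyDeviceToHost, stream);
        cudaEventRecord(ev[i], stream);   /* fires when this chunk is on the host */
        /* the consumer polls cudaEventQuery(ev[i]) == cudaSuccess before
         * touching host_buf[off .. off+chunk), as the loop above does with
         * _gaspi_event_send */
        off += chunk;
        i = (i + 1) % NCHUNK_EVENTS;
    }
}
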
Example No. 8
void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size)
{
  THArgCheck(size >= 0, 2, "invalid size");
  THAssert(self->allocator != NULL);
  int device;
  THCudaCheck(cudaGetDevice(&device));

  if(!(self->flag & TH_STORAGE_RESIZABLE))
    THError("Trying to resize storage that is not resizable");

  size_t elementSize = at::elementSize(self->scalar_type);

  if (self->allocator->realloc) {
    void * data_ptr = self->data_ptr;
    cudaError_t err = (*self->allocator->realloc)(
      self->allocatorContext,
      (void**)&(data_ptr),
      self->size * elementSize,
      size * elementSize, THCState_getCurrentStreamOnDevice(state, device));
    if (err != cudaSuccess) {
      THCudaCheck(err);
    }
    self->size = size;
    self->device = device;
    return;
  }

  if(size == 0)
  {
    if(self->flag & TH_STORAGE_FREEMEM) {
      THCudaCheck(
        (*self->allocator->free)(self->allocatorContext, self->data_ptr));
    }
    self->data_ptr = NULL;
    self->size = 0;
    self->device = device;
  }
  else
  {
    void *data = NULL;
    cudaError_t err =
      (*self->allocator->malloc)(self->allocatorContext,
                                 (void**)&(data),
                                 size * elementSize,
                                 THCState_getCurrentStreamOnDevice(state, device));
    THCudaCheck(err);

    if (self->data_ptr) {
      // Enable p2p access when the memcpy is across devices
      THCState_getPeerToPeerAccess(state, device, self->device);

      THCudaCheck(cudaMemcpyAsync(data,
                                  self->data_ptr,
                                  THMin(self->size, size) * elementSize,
                                  cudaMemcpyDeviceToDevice,
                                  THCState_getCurrentStream(state)));
      if(self->flag & TH_STORAGE_FREEMEM) {
        THCudaCheck(
          (*self->allocator->free)(self->allocatorContext, self->data_ptr));
      }
    }

    self->data_ptr = data;
    self->size = size;
    self->device = device;
  }
}
Example No. 9
    static void where(Param<uint> &out, CParam<T> in)
    {
        uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
        threads_x = std::min(threads_x, THREADS_PER_BLOCK);
        uint threads_y = THREADS_PER_BLOCK / threads_x;

        uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
        uint blocks_y = divup(in.dims[1], threads_y);

        Param<uint> rtmp;
        Param<uint> otmp;
        rtmp.dims[0] = blocks_x;
        otmp.dims[0] = in.dims[0];
        rtmp.strides[0] = 1;
        otmp.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            rtmp.dims[k] = in.dims[k];
            rtmp.strides[k] = rtmp.strides[k - 1] * rtmp.dims[k - 1];

            otmp.dims[k] = in.dims[k];
            otmp.strides[k] = otmp.strides[k - 1] * otmp.dims[k - 1];
        }

        int rtmp_elements = rtmp.strides[3] * rtmp.dims[3];
        rtmp.ptr = memAlloc<uint>(rtmp_elements);

        int otmp_elements = otmp.strides[3] * otmp.dims[3];
        otmp.ptr = memAlloc<uint>(otmp_elements);

        scan_first_launcher<T, uint, af_notzero_t, false, true>(otmp, rtmp, in,
                                                          blocks_x, blocks_y,
                                                          threads_x);

        // Linearize the dimensions and perform scan
        Param<uint> ltmp = rtmp;
        ltmp.dims[0] = rtmp_elements;
        for (int k = 1; k < 4; k++) {
            ltmp.dims[k] = 1;
            ltmp.strides[k] = rtmp_elements;
        }

        scan_first<uint, uint, af_add_t, true>(ltmp, ltmp);

        // Get output size and allocate output
        uint total;
        CUDA_CHECK(cudaMemcpyAsync(&total, rtmp.ptr + rtmp_elements - 1,
                              sizeof(uint), cudaMemcpyDeviceToHost,
                              cuda::getStream(cuda::getActiveDeviceId())));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

        out.ptr = memAlloc<uint>(total);

        out.dims[0] = total;
        out.strides[0] = 1;
        for (int k = 1; k < 4; k++) {
            out.dims[k] = 1;
            out.strides[k] = total;
        }

        dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
        dim3 blocks(blocks_x * in.dims[2],
                    blocks_y * in.dims[3]);

        uint lim = divup(otmp.dims[0], (threads_x * blocks_x));

        CUDA_LAUNCH((get_out_idx<T>), blocks, threads,
                out.ptr, otmp, rtmp, in, blocks_x, blocks_y, lim);
        POST_LAUNCH_CHECK();

        memFree(rtmp.ptr);
        memFree(otmp.ptr);
    }
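
where() learns the output size by copying the last element of the scanned counts back to the host and synchronizing the same stream before allocating out.ptr. A standalone sketch of that single-word readback (names are illustrative):

/* Sketch: read a single device-resident counter back to the host.
 * The stream synchronize is what makes the value safe to use. */
static unsigned int read_device_count(const unsigned int *d_count,
                                      cudaStream_t stream)
{
    unsigned int count = 0;
    cudaMemcpyAsync(&count, d_count, sizeof(unsigned int),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);    /* wait for the copy before reading count */
    return count;
}
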
Example No. 10
gaspi_return_t
pgaspi_gpu_write(const gaspi_segment_id_t segment_id_local,
		 const gaspi_offset_t offset_local,
		 const gaspi_rank_t rank,
		 const gaspi_segment_id_t segment_id_remote,
		 const gaspi_offset_t offset_remote,
		 const gaspi_size_t size,
		 const gaspi_queue_id_t queue,
		 const gaspi_timeout_t timeout_ms)
{
  if( glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 ||
      size <= GASPI_GPU_DIRECT_MAX )
    {
      return gaspi_write(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size, queue, timeout_ms);
    }

  if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms))
    return GASPI_TIMEOUT;

  char* host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local);
  char* device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local);

  gaspi_gpu* agpu =  _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId);
  if( !agpu )
    {
      gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs).");
      return GASPI_ERROR;
    }

  int size_left = size;
  int copy_size = 0;
  int gpu_offset = 0;
  const int BLOCK_SIZE = GASPI_GPU_BUFFERED;

  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  while(size_left > 0)
    {
      int i;
      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  if(size_left > BLOCK_SIZE)
	    copy_size = BLOCK_SIZE;
	  else
	    copy_size = size_left;

	  if( cudaMemcpyAsync(host_ptr + gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue]))
	    {
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  glb_gaspi_ctx.ne_count_c[queue]++;

	  agpu->events[queue][i].segment_remote = segment_id_remote;
	  agpu->events[queue][i].segment_local = segment_id_local;
	  agpu->events[queue][i].size = copy_size;
	  agpu->events[queue][i].rank = rank;
	  agpu->events[queue][i].offset_local = offset_local+gpu_offset;
	  agpu->events[queue][i].offset_remote = offset_remote+gpu_offset;
	  agpu->events[queue][i].in_use =1;

	  cudaError_t err = cudaEventRecord(agpu->events[queue][i].event, agpu->streams[queue]);
	  if(err != cudaSuccess)
	    {
	      glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT;
	      unlock_gaspi(&glb_gaspi_ctx.lockC[queue]);
	      return GASPI_ERROR;
	    }

	  gpu_offset += copy_size;
	  size_left -= copy_size;

	  if(size_left == 0)
	    break;

	  if(agpu->events[queue][i].ib_use)
	    {
	      struct ibv_wc wc;
	      int ne;
	      do
		{
		  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
		  glb_gaspi_ctx.ne_count_c[queue] -= ne;
		  if (ne == 0)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		} while(ne==0);
	      agpu->events[queue][i].ib_use = 0;
	    }
	}

      for(i = 0; i < GASPI_CUDA_EVENTS; i++)
	{
	  cudaError_t error;
	  if ( agpu->events[queue][i].in_use == 1 )
	    {
	      do
		{
		  error = cudaEventQuery(agpu->events[queue][i].event );
		  if( cudaSuccess == error )
		    {
		      if (_gaspi_event_send(&agpu->events[queue][i],queue))
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_ERROR;
			}

		      agpu->events[queue][i].in_use = 0;
		    }
		  else if(error == cudaErrorNotReady)
		    {
		      const gaspi_cycles_t s1 = gaspi_get_cycles ();
		      const gaspi_cycles_t tdelta = s1 - s0;

		      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
		      if (ms > timeout_ms)
			{
			  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
			  return GASPI_TIMEOUT;
			}
		    }
		  else
		    {
		      unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);
		      return GASPI_ERROR;
		    }
		} while(error != cudaSuccess);
	    }
	}
    }

  unlock_gaspi (&glb_gaspi_ctx.lockC[queue]);

  return GASPI_SUCCESS;
}
Example No. 11
int main(int argc,char **argv){

// Print GPU properties
//print_properties();

// Files to print the result after the last time step
FILE *rho_file;
FILE *E_file;
rho_file = fopen("rho_final.txt", "w");
E_file = fopen("E_final.txt", "w");

// Construct initial condition for problem
ICsinus Config(-1.0, 1.0, -1.0, 1.0); 
//ICsquare Config(0.5,0.5,gasGam);

// Set initial values for Configuration 1
/*
Config.set_rho(rhoConfig19);
Config.set_pressure(pressureConfig19);
Config.set_u(uConfig19);
Config.set_v(vConfig19);
*/

// Determining global border based on left over tiles (a little hack)
int globalPadding;
globalPadding = (nx+2*border+16)/16;
globalPadding = 16*globalPadding - (nx+2*border);
//printf("Globalpad: %i\n", globalPadding);

// Change border to add padding
//border = border + globalPadding/2;

// Initiate the matrices for the unknowns in the Euler equations
cpu_ptr_2D rho(nx, ny, border,1);
cpu_ptr_2D E(nx, ny, border,1);
cpu_ptr_2D rho_u(nx, ny, border,1);
cpu_ptr_2D rho_v(nx, ny, border,1);
cpu_ptr_2D zeros(nx, ny, border,1);

// Set initial condition
Config.setIC(rho, rho_u, rho_v, E);

double timeStart = get_wall_time();

// Test 
cpu_ptr_2D rho_dummy(nx, ny, border);
cpu_ptr_2D E_dummy(nx, ny, border);

/*
rho_dummy.xmin = -1.0;
rho_dummy.ymin = -1.0;
E_dummy.xmin = -1.0;
E_dummy.ymin = -1.0;
*/

// Set block and grid sizes
dim3 gridBC = dim3(1, 1, 1);
dim3 blockBC = dim3(BLOCKDIM_BC,1,1);

dim3 gridBlockFlux;
dim3 threadBlockFlux;

dim3 gridBlockRK;
dim3 threadBlockRK;

computeGridBlock(gridBlockFlux, threadBlockFlux, nx + 2*border, ny + 2*border, INNERTILEDIM_X, INNERTILEDIM_Y, BLOCKDIM_X, BLOCKDIM_Y);

computeGridBlock(gridBlockRK, threadBlockRK, nx + 2*border, ny + 2*border, BLOCKDIM_X_RK, BLOCKDIM_Y_RK, BLOCKDIM_X_RK, BLOCKDIM_Y_RK);

int nElements = gridBlockFlux.x*gridBlockFlux.y;

// Allocate memory for the GPU pointers
gpu_ptr_1D L_device(nElements);
gpu_ptr_1D dt_device(1);

gpu_ptr_2D rho_device(nx, ny, border);
gpu_ptr_2D E_device(nx, ny, border);
gpu_ptr_2D rho_u_device(nx, ny, border);
gpu_ptr_2D rho_v_device(nx, ny, border); 

gpu_ptr_2D R0(nx, ny, border);
gpu_ptr_2D R1(nx, ny, border);
gpu_ptr_2D R2(nx, ny, border);
gpu_ptr_2D R3(nx, ny, border);

gpu_ptr_2D Q0(nx, ny, border);
gpu_ptr_2D Q1(nx, ny, border);
gpu_ptr_2D Q2(nx, ny, border);
gpu_ptr_2D Q3(nx, ny, border);

// Allocate pinned memory on host
init_allocate();

// Set BC arguments
set_bc_args(BCArgs[0], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border);
set_bc_args(BCArgs[1], Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx+2*border, ny+2*border, border);
set_bc_args(BCArgs[2], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border);

// Set FLUX arguments
set_flux_args(fluxArgs[0], L_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y);
set_flux_args(fluxArgs[1], L_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y);

// Set TIME argument
set_dt_args(dtArgs, L_device.getRawPtr(), dt_device.getRawPtr(), nElements, rho.get_dx(), rho.get_dy(), cfl_number);

// Set Rk arguments
set_rk_args(RKArgs[0], dt_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx, ny, border); 
set_rk_args(RKArgs[1], dt_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx, ny, border); 


L_device.set(FLT_MAX);

/*
R0.upload(zeros.get_ptr()); 
R1.upload(zeros.get_ptr()); 
R2.upload(zeros.get_ptr()); 
R3.upload(zeros.get_ptr()); 

Q0.upload(zeros.get_ptr()); 
Q1.upload(zeros.get_ptr()); 
Q2.upload(zeros.get_ptr()); 
Q3.upload(zeros.get_ptr()); 
*/

R0.set(0,0,0,nx,ny,border); 
R1.set(0,0,0,nx,ny,border); 
R2.set(0,0,0,nx,ny,border); 
R3.set(0,0,0,nx,ny,border); 

Q0.set(0,0,0,nx,ny,border); 
Q1.set(0,0,0,nx,ny,border); 
Q2.set(0,0,0,nx,ny,border); 
Q3.set(0,0,0,nx,ny,border); 


rho_device.upload(rho.get_ptr());
rho_u_device.upload(rho_u.get_ptr());
rho_v_device.upload(rho_v.get_ptr());
E_device.upload(E.get_ptr());

// Update boundaries
callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[0]);

//Create cuda stream
cudaStream_t stream1;
cudaStreamCreate(&stream1);
cudaEvent_t dt_complete;
cudaEventCreate(&dt_complete);


while (currentTime < timeLength && step < maxStep){	
	
	//RK1	
	//Compute flux
	callFluxKernel(gridBlockFlux, threadBlockFlux, 0, fluxArgs[0]);	
	
	// Compute timestep (based on CFL condition)
	callDtKernel(TIMETHREADS, dtArgs);
	
	cudaMemcpyAsync(dt_host, dt_device.getRawPtr(), sizeof(float), cudaMemcpyDeviceToHost, stream1);
	cudaEventRecord(dt_complete, stream1);

	// Perform RK1 step
	callRKKernel(gridBlockRK, threadBlockRK, 0, RKArgs[0]);
	
	//Update boundaries
	callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[1]);		

	//RK2
	// Compute flux
	callFluxKernel(gridBlockFlux, threadBlockFlux, 1, fluxArgs[1]);

	//Perform RK2 step
	callRKKernel(gridBlockRK, threadBlockRK, 1, RKArgs[1]);	

	//cudaEventRecord(srteam_sync, srteam1);

	callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[2]);

	cudaEventSynchronize(dt_complete);

	step++;	
	currentTime += *dt_host;	
//	printf("Step: %i, current time: %.6f dt:%.6f\n" , step,currentTime, dt_host[0]);

}


//cuProfilerStop();
//cudaProfilerStop();

printf("Elapsed time %.5f", get_wall_time() - timeStart);

E_device.download(E.get_ptr());
rho_u_device.download(rho_u.get_ptr());
rho_v_device.download(rho_v.get_ptr());
rho_device.download(rho_dummy.get_ptr());

rho_dummy.printToFile(rho_file, true, false);


Config.exactSolution(E_dummy, currentTime);
E_dummy.printToFile(E_file, true, false);


float LinfError = Linf(E_dummy, rho_dummy);
float L1Error = L1(E_dummy, rho_dummy); 
float L1Error2 = L1test(E_dummy, rho_dummy);

printf("nx: %i\t Linf error %.9f\t L1 error %.7f L1test erro %.7f", nx, LinfError, L1Error, L1Error2);


printf("nx: %i step: %i, current time: %.6f dt:%.6f\n" , nx, step,currentTime, dt_host[0]); 


/*
cudaMemcpy(L_host, L_device, sizeof(float)*(nElements), cudaMemcpyDeviceToHost);
for (int i =0; i < nElements; i++)
	printf(" %.7f ", L_host[i]); 
*/


printf("%s\n", cudaGetErrorString(cudaGetLastError()));

return(0);
}
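
The time-stepping loop in Example No. 11 hides the dt readback: it issues the copy on stream1, records dt_complete, launches the remaining kernels, and only calls cudaEventSynchronize right before the host adds *dt_host to the running time. A minimal sketch of that overlap, assuming the destination is pinned host memory and using a hypothetical launch_rk_step() for the intervening work:

/* Sketch: overlap a scalar device-to-host readback with kernel work.
 * d_dt is device memory, h_dt is pinned host memory, launch_rk_step()
 * stands in for the flux/RK kernel launches above. */
extern void launch_rk_step(void);

static float overlapped_dt_readback(const float *d_dt, float *h_dt,
                                    cudaStream_t copy_stream,
                                    cudaEvent_t dt_ready)
{
    cudaMemcpyAsync(h_dt, d_dt, sizeof(float),
                    cudaMemcpyDeviceToHost, copy_stream);
    cudaEventRecord(dt_ready, copy_stream);   /* marks when *h_dt is valid */

    launch_rk_step();                         /* independent work overlaps the copy */

    cudaEventSynchronize(dt_ready);           /* block only when the value is needed */
    return *h_dt;
}
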
Example No. 12
void GranularGPUDataTransferer::CopyCPUToGPUAsync(const void* cpuBuffer, size_t numElements, size_t elementSize, void* gpuBuffer)
{
    PrepareDevice(m_deviceId);
    cudaMemcpyAsync(gpuBuffer, cpuBuffer, numElements * elementSize, cudaMemcpyHostToDevice, GetAssignStream()) || "cudaMemcpyAsync failed";
}
Example No. 13
int main(int argc, char*argv[])
{
	FILE *fp;
	uint16_t i, fsize, pad_size, stream_id;
	char * rtp_pkt;
	uint8_t default_aes_keys[AES_KEY_SIZE], default_ivs[AES_IV_SIZE], default_hmac_keys[HMAC_KEY_SIZE];

	struct  timespec start, end;
#if defined(KERNEL_TEST)
	struct  timespec kernel_start, kernel_end;
#endif


	cudaEvent_t startE, stopE;
    cudaEventCreate(&startE);
    cudaEventCreate(&stopE);


	uint32_t NUM_FLOWS, STREAM_NUM;
	if (argc > 2) {
		NUM_FLOWS = atoi(argv[1]);
		STREAM_NUM = atoi(argv[2]);
	} else {
		NUM_FLOWS = 8192;
		STREAM_NUM = 1;
	}
	//printf ("Num of flows is %d, stream num is %d\n", NUM_FLOWS, STREAM_NUM);

	cudaStream_t stream[STREAM_NUM];
	for (i = 0; i < STREAM_NUM; i ++) {
		cudaStreamCreate(&stream[i]);
	}

	uint8_t * host_in,*device_in[STREAM_NUM];
	uint8_t * host_aes_keys,* device_aes_keys[STREAM_NUM];
	uint8_t * host_ivs,* device_ivs[STREAM_NUM];
	uint8_t * host_hmac_keys,*device_hmac_keys[STREAM_NUM];
	uint32_t * host_pkt_offset,*device_pkt_offset[STREAM_NUM];
	uint16_t * host_actual_length,*device_actual_length[STREAM_NUM];
       
	double diff;
	uint8_t a = 123;

	fp = fopen("rtp.pkt", "rb");
	fseek(fp, 0, SEEK_END);
	// NOTE: fsize should be 1356 bytes
	//fsize = ftell(fp);
	fsize = 1328;
	fseek(fp, 0, SEEK_SET);

	rtp_pkt = (char *)calloc(fsize, sizeof(char));
	fread(rtp_pkt, fsize, sizeof(char), fp);

	pad_size = (fsize + 63 + 9) & (~0x03F);

	//printf("the original package is %d bytes,now we pad it to %d bytes\n", fsize, pad_size);

	for (i = 0; i < AES_KEY_SIZE; i ++)
		default_aes_keys[i] = a;
	for (i = 0; i < AES_IV_SIZE; i ++)
		default_ivs[i] = a;
	for (i = 0; i < HMAC_KEY_SIZE; i ++)
		default_hmac_keys[i] = a;

	//printf("duplicate it %d times, takes %d bytes\n",NUM_FLOWS,pad_size*NUM_FLOWS);
	cudaHostAlloc((void **)&host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaHostAllocDefault);
	cudaHostAlloc((void **)&host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaHostAllocWriteCombined);
	cudaHostAlloc((void **)&host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaHostAllocWriteCombined);

	for (i = 0; i < NUM_FLOWS; i ++){
		memcpy(host_in + i * pad_size, rtp_pkt, fsize * sizeof(uint8_t));
		memcpy((uint8_t *)host_aes_keys + i * AES_KEY_SIZE, default_aes_keys, AES_KEY_SIZE);
		memcpy((uint8_t *)host_ivs + i * AES_IV_SIZE, default_ivs, AES_IV_SIZE);
		memcpy((uint8_t *)host_hmac_keys + i * HMAC_KEY_SIZE, default_hmac_keys, HMAC_KEY_SIZE);
		host_pkt_offset[i] = i * pad_size;
		host_actual_length[i] = fsize;
	}

	for (i = 0; i < STREAM_NUM; i ++) {
		cudaMalloc((void **)&(device_in[i]), pad_size * NUM_FLOWS * sizeof(uint8_t));
		cudaMalloc((void **)&(device_aes_keys[i]), NUM_FLOWS * AES_KEY_SIZE);
		cudaMalloc((void **)&(device_ivs[i]), NUM_FLOWS * AES_IV_SIZE);
		cudaMalloc((void **)&(device_hmac_keys[i]), NUM_FLOWS * HMAC_KEY_SIZE);
		cudaMalloc((void **)&(device_pkt_offset[i]), NUM_FLOWS * PKT_OFFSET_SIZE);
		cudaMalloc((void **)&(device_actual_length[i]), NUM_FLOWS * PKT_LENGTH_SIZE);
	}

	/* warm up */
	for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) {
		cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
		cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);

		co_aes_sha1_gpu(
					device_in[stream_id],
					device_in[stream_id],
					device_aes_keys[stream_id],
					device_ivs[stream_id],
					device_hmac_keys[stream_id],
					device_pkt_offset[stream_id],
					device_actual_length[stream_id],
					NUM_FLOWS,
					NULL,
					THREADS_PER_BLK,
					stream[stream_id]);

		cudaDeviceSynchronize();
	}

	/* Real test */
	for (i = 0; i < 1; i ++) {
		clock_gettime(CLOCK_MONOTONIC, &start);
		cudaEventRecord(startE, 0);

		for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) {

			cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);
			cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]);

#if defined(KERNEL_TEST)
			cudaDeviceSynchronize();
			clock_gettime(CLOCK_MONOTONIC, &kernel_start);
			//gettimeofday(&kernel_start, NULL);
#endif
			co_aes_sha1_gpu(
					device_in[stream_id],
					device_in[stream_id],
					device_aes_keys[stream_id],
					device_ivs[stream_id],
					device_hmac_keys[stream_id],
					device_pkt_offset[stream_id],
					device_actual_length[stream_id],
					NUM_FLOWS,
					NULL,
					THREADS_PER_BLK,
					stream[stream_id]);
#if defined(KERNEL_TEST)
			cudaDeviceSynchronize();
			clock_gettime(CLOCK_MONOTONIC, &kernel_end);
			//gettimeofday(&kernel_end, NULL);
#endif
			cudaMemcpyAsync(host_in, device_in[stream_id], pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream[stream_id]);
		}

		cudaDeviceSynchronize();
		clock_gettime(CLOCK_MONOTONIC, &end);

		cudaEventRecord(stopE, 0);
		cudaEventSynchronize(stopE);
		float time;
		cudaEventElapsedTime(&time, startE, stopE);
		//printf("event speed is ------- %f Gbps\n", (fsize * 8 * NUM_FLOWS * STREAM_NUM * 1e-6)/time);

#if defined(KERNEL_TEST)
		diff = 1000000 * (kernel_end.tv_sec-kernel_start.tv_sec)+ (kernel_end.tv_nsec-kernel_start.tv_nsec)/1000;
		printf("Only Kernel, the difference is %lf ms, speed is %lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff);
#else
		diff = 1000000 * (end.tv_sec-start.tv_sec)+ (end.tv_nsec-start.tv_nsec)/1000;
		printf("%lf\n", (double)diff/1000);
		//printf("%lfms,%lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff);
#endif
	}

	return 0;
}
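
Example No. 13 times the asynchronous pipeline both with clock_gettime around a cudaDeviceSynchronize and with a cudaEvent pair. A minimal sketch of the event-based variant, with enqueue_pipeline() standing in for the cudaMemcpyAsync and co_aes_sha1_gpu calls:

/* Sketch: GPU-side timing of an asynchronous pipeline with a cudaEvent pair.
 * enqueue_pipeline() is a hypothetical stand-in for the async copies and
 * kernel launches above. */
extern void enqueue_pipeline(void);

static float time_pipeline_ms(void)
{
    cudaEvent_t startE, stopE;
    float ms = 0.0f;

    cudaEventCreate(&startE);
    cudaEventCreate(&stopE);

    cudaEventRecord(startE, 0);       /* timestamp on the default stream */
    enqueue_pipeline();               /* async copies and kernel launches */
    cudaEventRecord(stopE, 0);

    cudaEventSynchronize(stopE);      /* wait for everything recorded before stopE */
    cudaEventElapsedTime(&ms, startE, stopE);

    cudaEventDestroy(startE);
    cudaEventDestroy(stopE);
    return ms;
}

Because startE and stopE are recorded on the legacy default stream, the stop record does not complete until earlier work on streams created with plain cudaStreamCreate (blocking streams) has drained, which is what lets a single event pair bracket copies issued on several streams, as in the benchmark above.
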
Example No. 14
To reduce_all(CParam<Ti> in, bool change_nan, double nanval) {
    int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3];
    bool is_linear  = (in.strides[0] == 1);
    for (int k = 1; k < 4; k++) {
        is_linear &= (in.strides[k] == (in.strides[k - 1] * in.dims[k - 1]));
    }

    // FIXME: Use better heuristics to get to the optimum number
    if (in_elements > 4096 || !is_linear) {
        if (is_linear) {
            in.dims[0] = in_elements;
            for (int k = 1; k < 4; k++) {
                in.dims[k]    = 1;
                in.strides[k] = in_elements;
            }
        }

        uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
        threads_x      = std::min(threads_x, THREADS_PER_BLOCK);
        uint threads_y = THREADS_PER_BLOCK / threads_x;

        Param<To> tmp;

        uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
        uint blocks_y = divup(in.dims[1], threads_y);

        tmp.dims[0]    = blocks_x;
        tmp.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            tmp.dims[k]    = in.dims[k];
            tmp.strides[k] = tmp.dims[k - 1] * tmp.strides[k - 1];
        }

        int tmp_elements = tmp.strides[3] * tmp.dims[3];

        auto tmp_alloc = memAlloc<To>(tmp_elements);
        tmp.ptr        = tmp_alloc.get();
        reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y,
                                          threads_x, change_nan, nanval);

        std::vector<To> h_data(tmp_elements);
        CUDA_CHECK(
            cudaMemcpyAsync(h_data.data(), tmp.ptr, tmp_elements * sizeof(To),
                            cudaMemcpyDeviceToHost, cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Binary<To, op> reduce;
        To out = Binary<To, op>::init();
        for (int i = 0; i < tmp_elements; i++) { out = reduce(out, h_data[i]); }

        return out;
    } else {
        std::vector<Ti> h_data(in_elements);
        CUDA_CHECK(
            cudaMemcpyAsync(h_data.data(), in.ptr, in_elements * sizeof(Ti),
                            cudaMemcpyDeviceToHost, cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Transform<Ti, To, op> transform;
        Binary<To, op> reduce;
        To out       = Binary<To, op>::init();
        To nanval_to = scalar<To>(nanval);

        for (int i = 0; i < in_elements; i++) {
            To in_val = transform(h_data[i]);
            if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval_to;
            out = reduce(out, in_val);
        }

        return out;
    }
}
Example No. 15
static void gpu_memcpy_async(void *dst, void *src, size_t size, void *async_id)
{
  cudaStream_t st = *((cudaStream_t*)async_id);
  CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, st));
}
Example No. 16
DeepCopy<HostSpace,CudaSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
Example No. 17
void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
  cudaStream_t s = get_deep_copy_stream();
  CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , s ) );
  cudaStreamSynchronize(s);
}
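
The three wrappers above all forward to cudaMemcpyAsync with cudaMemcpyDefault. One caveat worth keeping in mind: the call only overlaps with host work when the host buffer is page-locked; with pageable memory the runtime may stage the transfer and behave synchronously. A minimal sketch of the pinned-buffer case, with hypothetical fill_input() and do_host_work() helpers:

/* Sketch: asynchronous H2D copy that actually overlaps with host work.
 * fill_input() and do_host_work() are hypothetical. */
extern void fill_input(float *buf, size_t n);
extern void do_host_work(void);

static void async_upload(size_t n)
{
    float *pinned, *d_buf;
    cudaStream_t s;

    cudaStreamCreate(&s);
    cudaMallocHost((void **)&pinned, n * sizeof(float));  /* page-locked host memory */
    cudaMalloc((void **)&d_buf, n * sizeof(float));

    fill_input(pinned, n);
    cudaMemcpyAsync(d_buf, pinned, n * sizeof(float),
                    cudaMemcpyDefault, s);                 /* returns immediately */
    do_host_work();                                        /* overlaps with the copy */
    cudaStreamSynchronize(s);                              /* buffers reusable after this */

    cudaFree(d_buf);
    cudaFreeHost(pinned);
    cudaStreamDestroy(s);
}
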