int TEMPLATE2 (CHOLMOD (gpu_updateC)) ( Int ndrow1, /* C is ndrow2-by-ndrow2 */ Int ndrow2, Int ndrow, /* leading dimension of Lx */ Int ndcol, /* L1 is ndrow1-by-ndcol */ Int nsrow, Int pdx1, /* L1 starts at Lx + L_ENTRY*pdx1 */ /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */ Int pdi1, double *Lx, double *C, cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrLx, *devPtrC ; double alpha, beta ; cublasStatus_t cublasStatus ; cudaError_t cudaStat [2] ; Int ndrow3 ; int icol, irow; int iHostBuff, iDevBuff ; #ifndef NTIMER double tstart = 0; #endif if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) || (ndcol*L_ENTRY < CHOLMOD_ND_COL_LIMIT)) { /* too small for the CUDA BLAS; use the CPU instead */ return (0) ; } ndrow3 = ndrow2 - ndrow1 ; #ifndef NTIMER Common->syrkStart = SuiteSparse_time ( ) ; Common->CHOLMOD_GPU_SYRK_CALLS++ ; #endif /* ---------------------------------------------------------------------- */ /* allocate workspace on the GPU */ /* ---------------------------------------------------------------------- */ iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS; iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS; /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS, usually 2, so we can overlap the copy of this descendent supernode with the compute of the previous descendant supernode */ devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]); /* very little overlap between kernels for difference descendant supernodes (since we enforce the supernodes must be large enough to fill the device) so we only need one C buffer */ devPtrC = (double *)(gpu_p->d_C); /* ---------------------------------------------------------------------- */ /* copy Lx to the GPU */ /* ---------------------------------------------------------------------- */ /* copy host data to pinned buffer first for better H2D bandwidth */ #pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) if (ndcol > 32) for ( icol=0; icol<ndcol; icol++ ) { for ( irow=0; irow<ndrow2*L_ENTRY; irow++ ) { gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] = Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow]; } } cudaStat[0] = cudaMemcpyAsync ( devPtrLx, gpu_p->h_Lx[iHostBuff], ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]), cudaMemcpyHostToDevice, Common->gpuStream[iDevBuff] ); if ( cudaStat[0] ) { CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0])); return (0); } /* make the current stream wait for kernels in previous streams */ cudaStreamWaitEvent ( Common->gpuStream[iDevBuff], Common->updateCKernelsComplete, 0 ) ; /* ---------------------------------------------------------------------- */ /* create the relative map for this descendant supernode */ /* ---------------------------------------------------------------------- */ createRelativeMapOnDevice ( (Int *)(gpu_p->d_Map), (Int *)(gpu_p->d_Ls), (Int *)(gpu_p->d_RelativeMap), pdi1, ndrow2, &(Common->gpuStream[iDevBuff]) ); /* ---------------------------------------------------------------------- */ /* do the CUDA SYRK */ /* ---------------------------------------------------------------------- */ cublasStatus = cublasSetStream (Common->cublasHandle, Common->gpuStream[iDevBuff]) ; if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ; } alpha = 1.0 ; beta = 0.0 ; #ifdef REAL cublasStatus = cublasDsyrk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, (int) ndrow1, (int) ndcol, /* N, K: L1 is ndrow1-by-ndcol */ &alpha, /* ALPHA: 1 */ devPtrLx, ndrow2, /* A, LDA: L1, ndrow2 */ &beta, /* BETA: 0 */ 
devPtrC, ndrow2) ; /* C, LDC: C1 */ #else cublasStatus = cublasZherk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, (int) ndrow1, (int) ndcol, /* N, K: L1 is ndrow1-by-ndcol*/ &alpha, /* ALPHA: 1 */ (const cuDoubleComplex *) devPtrLx, ndrow2, /* A, LDA: L1, ndrow2 */ &beta, /* BETA: 0 */ (cuDoubleComplex *) devPtrC, ndrow2) ; /* C, LDC: C1 */ #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } #ifndef NTIMER Common->CHOLMOD_GPU_SYRK_TIME += SuiteSparse_time() - Common->syrkStart; #endif /* ---------------------------------------------------------------------- */ /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1' */ /* ---------------------------------------------------------------------- */ #ifndef NTIMER Common->CHOLMOD_GPU_GEMM_CALLS++ ; tstart = SuiteSparse_time(); #endif if (ndrow3 > 0) { #ifndef REAL cuDoubleComplex calpha = {1.0,0.0} ; cuDoubleComplex cbeta = {0.0,0.0} ; #endif /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dgemm */ /* ------------------------------------------------------------------ */ #ifdef REAL alpha = 1.0 ; beta = 0.0 ; cublasStatus = cublasDgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, ndrow3, ndrow1, ndcol, /* M, N, K */ &alpha, /* ALPHA: 1 */ devPtrLx + L_ENTRY*(ndrow1), /* A, LDA: L2*/ ndrow2, /* ndrow */ devPtrLx, /* B, LDB: L1 */ ndrow2, /* ndrow */ &beta, /* BETA: 0 */ devPtrC + L_ENTRY*ndrow1, /* C, LDC: C2 */ ndrow2) ; #else cublasStatus = cublasZgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_C, ndrow3, ndrow1, ndcol, /* M, N, K */ &calpha, /* ALPHA: 1 */ (const cuDoubleComplex*) devPtrLx + ndrow1, ndrow2, /* ndrow */ (const cuDoubleComplex *) devPtrLx, ndrow2, /* ndrow */ &cbeta, /* BETA: 0 */ (cuDoubleComplex *)devPtrC + ndrow1, ndrow2) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } } #ifndef NTIMER Common->CHOLMOD_GPU_GEMM_TIME += SuiteSparse_time() - tstart; #endif /* ------------------------------------------------------------------ */ /* Assemble the update C on the device using the d_RelativeMap */ /* ------------------------------------------------------------------ */ #ifdef REAL addUpdateOnDevice ( gpu_p->d_A[0], devPtrC, gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow, &(Common->gpuStream[iDevBuff]) ); #else addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC, gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow, &(Common->gpuStream[iDevBuff]) ); #endif /* Record an event indicating that kernels for this descendant are complete */ cudaEventRecord ( Common->updateCKernelsComplete, Common->gpuStream[iDevBuff]); cudaEventRecord ( Common->updateCBuffersFree[iHostBuff], Common->gpuStream[iDevBuff]); return (1) ; }
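/* A minimal, self-contained sketch of the staging pattern the CHOLMOD routine above
 * relies on: copy each block of host data into a pinned buffer, issue the H2D transfer
 * and the kernel on a per-block stream, and cycle two buffers so the copy of block i+1
 * can overlap the compute of block i.  Buffer count, block size, and the dummy kernel
 * are illustrative only, not part of CHOLMOD. */
#include <cuda_runtime.h>
#include <string.h>

#define NBUF  2
#define NBLK  8
#define BLKSZ (1 << 20)          /* elements per block */

__global__ void scale(double *x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 2.0;
}

void pipeline(const double *src /* host, NBLK*BLKSZ entries */)
{
    double *h_buf[NBUF], *d_buf[NBUF];
    cudaStream_t stream[NBUF];

    for (int b = 0; b < NBUF; b++)
    {
        cudaMallocHost((void **)&h_buf[b], BLKSZ * sizeof(double));   /* pinned */
        cudaMalloc((void **)&d_buf[b], BLKSZ * sizeof(double));
        cudaStreamCreate(&stream[b]);
    }

    for (int k = 0; k < NBLK; k++)
    {
        int b = k % NBUF;

        /* wait until the previous transfer/kernel using this buffer has finished */
        cudaStreamSynchronize(stream[b]);

        /* stage into pinned memory, then queue the async copy and the kernel */
        memcpy(h_buf[b], src + (size_t)k * BLKSZ, BLKSZ * sizeof(double));
        cudaMemcpyAsync(d_buf[b], h_buf[b], BLKSZ * sizeof(double),
                        cudaMemcpyHostToDevice, stream[b]);
        scale<<<(BLKSZ + 255) / 256, 256, 0, stream[b]>>>(d_buf[b], BLKSZ);
    }

    cudaDeviceSynchronize();

    for (int b = 0; b < NBUF; b++)
    {
        cudaFreeHost(h_buf[b]);
        cudaFree(d_buf[b]);
        cudaStreamDestroy(stream[b]);
    }
}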
void TEMPLATE2 (CHOLMOD (gpu_final_assembly))
(
    cholmod_common *Common,
    double *Lx,
    Int psx,
    Int nscol,
    Int nsrow,
    int supernodeUsedGPU,
    int *iHostBuff,
    int *iDevBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    Int iidx, i, j;
    Int iHostBuff2 ;
    Int iDevBuff2 ;

    if ( supernodeUsedGPU ) {

        /* ------------------------------------------------------------------ */
        /* Apply all of the Schur-complement updates, computed on the GPU, to */
        /* the supernode.                                                     */
        /* ------------------------------------------------------------------ */

        *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* If this supernode is going to be factored using the GPU (potrf)
             * then it will need the portion of the update assembled on the
             * CPU.  So copy that portion to a pinned buffer and issue an H2D
             * copy to the device. */

            /* wait until a buffer is free */
            cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] );

            /* copy update assembled on CPU to a pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx];
                }
            }

            /* H2D transfer of update assembled on CPU */
            cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff],
                              nscol*nsrow*L_ENTRY*sizeof(double),
                              cudaMemcpyHostToDevice,
                              Common->gpuStream[*iDevBuff] );
        }

        Common->ibuffer++;

        iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
        iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

        /* wait for all kernels to complete */
        cudaEventSynchronize( Common->updateCKernelsComplete );

        /* copy assembled Schur-complement updates computed on GPU */
        cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0],
                          nscol*nsrow*L_ENTRY*sizeof(double),
                          cudaMemcpyDeviceToHost,
                          Common->gpuStream[iDevBuff2] );

        if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) {

            /* with the current implementation, potrf still uses data from the
             * CPU - so put the fully assembled supernode in a pinned buffer
             * for fastest access */

            /* need both H2D and D2H copies to be complete */
            cudaDeviceSynchronize();

            /* sum updates from CPU and device on the device */
#ifdef REAL
            sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol );
#else
            sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0,
                                  nsrow, nscol );
#endif

            /* place final assembled supernode in pinned buffer */

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nscol*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    gpu_p->h_Lx[*iHostBuff][iidx] -=
                        gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }

        }
        else
        {
            /* assemble with CPU updates */
            cudaDeviceSynchronize();

#pragma omp parallel for num_threads(CHOLMOD_OMP_NUM_THREADS) \
    private(iidx) if (nscol>32)

            for ( j=0; j<nscol; j++ ) {
                for ( i=j*L_ENTRY; i<nsrow*L_ENTRY; i++ ) {
                    iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx];
                }
            }
        }
    }
    return;
}
void GpuDevice::DoCopyRemoteData(float* dst, float* src, size_t size, int thrid)
{
    CUDA_CALL(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, impl_->stream[thrid]));
    CUDA_CALL(cudaStreamSynchronize(impl_->stream[thrid]));
}
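/* The copy above is issued asynchronously but immediately followed by a stream
 * synchronize, so the call blocks the caller while keeping the transfer ordered with
 * other work already queued on that stream.  A minimal sketch of the same idea with a
 * plain stream (names are illustrative, not part of GpuDevice): */
#include <cuda_runtime.h>

void blocking_copy_on_stream(void *dst, const void *src, size_t bytes, cudaStream_t s)
{
    /* cudaMemcpyDefault lets the runtime infer the direction from the pointers
     * (requires unified virtual addressing, available on 64-bit platforms) */
    cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDefault, s);
    cudaStreamSynchronize(s);   /* returns only after the copy has completed */
}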
int main( int argc, char*argv[] ) { float *p_old,*p_new,*p_tmp; int n,nn; float gosa,gflops,thruput,thruput2; double time_start,time_max,target,bytes; cudaStream_t stream_top,stream_btm; NP=1; gpu=0; ME=0; target= 60.0; omega= 0.8f; imax = MIMAX-1; jmax = MJMAX-1; kmax = MKMAX-1; imax_global = NP*(imax-2)+2; nn = ITERS; if(ME==0) { printf("\n mimax = %d mjmax = %d mkmax = %d pitch = %d\n",MIMAX, MJMAX, MKMAX, PITCH); printf(" imax = %d jmax = %d kmax = %d\n",imax_global,jmax,kmax); printf(" gridX = %d gridY = %d blockX = %d blockY = %d\n", GRID_X, GRID_Y, BLOCK_X, BLOCK_Y); } //printf("There are %d processes, I am process# %d using GPU %d\n",NP,ME,gpu); CUDA_SAFE_CALL(cudaSetDevice(gpu)); stream_top = 0; stream_btm = 0; #if (CUDART_VERSION >= 3000) { #if (CUDART_VERSION > 3000) struct cudaDeviceProp prop; // display ECC configuration, only queryable post r3.0 CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, gpu)); printf (" ECC on GPU %d is %s\n", gpu, prop.ECCEnabled ? "ON" : "OFF"); #endif /* CUDART_VERSION > 3000 */ // configure kernels for large shared memory to get better occupancy printf (" Configuring GPU L1 cache size ...\n"); set_kernel_cache_config (cudaFuncCachePreferShared); } #endif /* CUDART_VERSION >= 3000 */ CUDA_SAFE_CALL(cudaStreamCreate(&stream_top)); CUDA_SAFE_CALL(cudaStreamCreate(&stream_btm)); if(ME==0) printf(" Allocating Memory...\n"); allocate_memory(); if(ME==0) printf(" Initializing Data...\n\n"); initmt(); if(ME==0) { printf(" Now, start GPU measurement process.\n"); printf(" The loop will be excuted %d times\n",nn); printf(" Wait for a while\n\n"); } time_start = wallclock(); gosa = 0.0f; p_new = p2_d; p_old = p1_d; for(n=0 ; n<nn; n++) { //swap pointers p_tmp = p_new; p_new = p_old; p_old = p_tmp; jacobi_GPU_btm_even (stream_btm,a0_d,a1_d,a2_d,a3_d,b0_d,b1_d,b2_d,c0_d, c1_d,c2_d,wrk_d,bnd_d,p_old,p_new,gosa_d,omega,n); cudaMemcpyAsync (gosa_btm, gosa_d, sizeof(float), cudaMemcpyDeviceToHost, stream_btm); // Since we want to print intermediate values of gosa every PRINT_ITER // iterations, we need to synchronize before picking up the asynchronously // updated value. if (!(n % PRINT_ITER)) { cudaStreamSynchronize(stream_btm); gosa = *gosa_btm; } if(ME==0 && n%PRINT_ITER==0) printf(" iter: %d \tgosa: %e\n",n,gosa); } cudaThreadSynchronize(); gosa = *gosa_btm; time_max = wallclock() - time_start; gflops = (float)(34.0*( (double)nn*(double)(imax_global-2)*(double)(jmax-2)*(double)(kmax-2) ) / time_max * 1e-9); bytes = NP*((double)nn*(56.0*(imax-2)+8.0)*(double)(jmax)*(double)(kmax)); thruput = (float)(bytes / time_max / 1024.0 / 1024.0 / 1024.0); thruput2 = (float)(bytes / time_max / 1e9); if(ME==0) { printf(" \nLoop executed for %d times\n",nn); printf(" Gosa : %e \n",gosa); printf(" total Compute : %4.1f GFLOPS\ttime : %f seconds\n",gflops,time_max); printf(" total Bandwidth : %4.1f GB/s\n", thruput); printf(" total Bandwidth : %4.1f GB/s (STREAM equivalent)\n",thruput2); printf(" Score based on Pentium III 600MHz : %f\n\n",1000.0*gflops/82.0); } cleanup(); CUDA_SAFE_CALL(cudaStreamDestroy(stream_top)); CUDA_SAFE_CALL(cudaStreamDestroy(stream_btm)); //check_results(); return (EXIT_SUCCESS); }
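/* The benchmark loop above queues an asynchronous copy of the residual (gosa) on every
 * iteration but only synchronizes and reads it every PRINT_ITER iterations, so the
 * monitoring does not stall the solver on the other steps.  A stripped-down sketch of
 * that monitoring pattern; the kernel launch is elided and the names are illustrative. */
#include <cuda_runtime.h>
#include <stdio.h>

#define PRINT_EVERY 100

void monitor_residual(float *d_resid, float *h_resid /* ideally pinned */,
                      cudaStream_t s, int iters)
{
    for (int n = 0; n < iters; n++)
    {
        /* ... launch this iteration's kernels on stream s here ... */

        cudaMemcpyAsync(h_resid, d_resid, sizeof(float),
                        cudaMemcpyDeviceToHost, s);

        if (n % PRINT_EVERY == 0)
        {
            cudaStreamSynchronize(s);          /* make *h_resid current */
            printf("iter %d residual %e\n", n, *h_resid);
        }
    }
    cudaStreamSynchronize(s);                  /* pick up the final value */
}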
/* created thread, all this calls are in the thread context */ void *upro_gpu_worker_main(upro_gpu_worker_context_t *context) { upro_timer_t t, loopcounter; upro_log_t log; int i, id = 0, start = 0; double elapsed_time; upro_batch_buf_t *buf; assert(config->cpu_worker_num <= 16); cudaStream_t stream[MAX_GPU_STREAM]; for (i = 0; i < MAX_GPU_STREAM; i ++) { cudaStreamCreate(&stream[i]); } /* Init timers */ upro_timer_init(&t); // For separate events upro_timer_init(&counter); // For the whole program upro_timer_init(&loopcounter); // For each loop upro_log_init(&log); /* Initialize GPU worker, we wait for that all CPU workers have been initialized * then we can init GPU worker with the batches of CPU worker */ upro_gpu_worker_t g; upro_gpu_worker_init(&g, context); printf("GPU Worker is working on core %d ...\n", context->core_id); /* Timers for each kernel launch */ upro_timer_restart(&loopcounter); i = 0; for (;;) { upro_log_loop_marker(&log); i ++; if (upro_unlikely(i == 300)) { printf("-------------------------------------------------------------------------------------------------\n"); upro_timer_restart(&counter); total_packets = 0; } ////////////////////////////////////////// /* This is a CPU/GPU synchronization point */ do { elapsed_time = upro_timer_get_elapsed_time(&loopcounter); if (elapsed_time - config->I > 0.01) { // surpassed the time point more than 1 ms upro_log_msg(&log, "\n%s %lf\n", "--- [GPU Worker] Time point lost! : ", elapsed_time); // assert(0); } } while ((double)(config->I) - elapsed_time > 0.01); upro_log_msg(&log, "%s %lf\n", "--- [GPU Worker] Time point arrived : ", elapsed_time); upro_timer_restart(&loopcounter); ////////////////////////////////////////// /* Get Input Buffer from CPU Workers */ upro_gpu_get_batch(&g, context->cpu_batch_set); upro_timer_restart(&t); for (id = 0; id < config->cpu_worker_num; id ++) { buf = g.bufs[g.cur_buf_id][id]; total_packets += buf->job_num; #if defined(NOT_GPU) printf("%d,", buf->job_num); continue; #endif if (buf->job_num == 0) { printf("%d,", buf->job_num); continue; } else { printf("%d,", buf->job_num); /* if (upro_unlikely(start == 0)) { upro_timer_restart(&counter); start = 1; }*/ } // FOR DEBUG /* { int j; for (j = 0; j < buf->job_num; j ++) { assert(((uint16_t *)(buf->length_pos))[j] == 1328); assert(((uint32_t *)(buf->pkt_offset_pos))[j] == 1344 * j); uint64_t a = *(uint32_t *) ((uint8_t *)buf->input_buf + 1344 * j); assert(a == 0x01006080); } } */ #if defined(TRANSFER_SEPERATE) cudaMemcpyAsync(buf->input_buf_d, buf->input_buf, buf->buf_length, cudaMemcpyHostToDevice, stream[id]); cudaMemcpyAsync(buf->aes_key_pos_d, buf->aes_key_pos, AES_KEY_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]); cudaMemcpyAsync(buf->aes_iv_pos_d, buf->aes_iv_pos, AES_IV_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]); cudaMemcpyAsync(buf->pkt_offset_pos_d, buf->pkt_offset_pos, PKT_OFFSET_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]); cudaMemcpyAsync(buf->length_pos_d, buf->length_pos, PKT_LENGTH_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]); cudaMemcpyAsync(buf->hmac_key_pos_d, buf->hmac_key_pos, HMAC_KEY_SIZE * buf->job_num, cudaMemcpyHostToDevice, stream[id]); #else cudaMemcpyAsync(buf->input_buf_d, buf->input_buf, alloc_size, cudaMemcpyHostToDevice, stream[id]); #endif co_aes_sha1_gpu ( buf->input_buf_d, buf->input_buf_d, // output_buf = input_buf, we do not allocate output now buf->aes_key_pos_d, buf->aes_iv_pos_d, buf->hmac_key_pos_d, buf->pkt_offset_pos_d, buf->length_pos_d, buf->job_num, NULL, 
256, // the library requires to initialize the T-box stream[id]); cudaMemcpyAsync(buf->input_buf, buf->input_buf_d, buf->buf_length, cudaMemcpyDeviceToHost, stream[id]); } cudaDeviceSynchronize(); upro_timer_stop(&t); upro_log_msg(&log, "\n%s %lf ms\n", "--- [GPU Worker] Execution Time :", upro_timer_get_total_time(&t)); /* Tell the forwarders that this batch has been processed */ upro_gpu_give_to_forwarder(&g, context->cpu_batch_set); } upro_timer_stop(&counter); printf("End of execution, now the program costs : %f ms\n", upro_timer_get_total_time(&counter)); // printf("Processing speed is %.2f Mbps\n", (bytes * 8) / (1e3 * upro_timer_get_total_time(&counter))); // upro_log_print(&log); return 0; }
cudaError_t WINAPI wine_cudaMemcpyAsync( void *dst, const void *src, size_t count,
                                         enum cudaMemcpyKind kind, cudaStream_t stream )
{
    WINE_TRACE("\n");
    return cudaMemcpyAsync( dst, src, count, kind, stream );
}
gaspi_return_t pgaspi_gpu_write_notify(const gaspi_segment_id_t segment_id_local, const gaspi_offset_t offset_local, const gaspi_rank_t rank, const gaspi_segment_id_t segment_id_remote, const gaspi_offset_t offset_remote, const gaspi_size_t size, const gaspi_notification_id_t notification_id, const gaspi_notification_t notification_value, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { if(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 || size <= GASPI_GPU_DIRECT_MAX ) { return gaspi_write_notify(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size,notification_id, notification_value, queue, timeout_ms); } if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms)) return GASPI_TIMEOUT; char *host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr+NOTIFY_OFFSET+offset_local); char* device_ptr =(char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr+offset_local); gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId); if( !agpu ) { gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs)."); unlock_gaspi(&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } int copy_size = 0; int gpu_offset = 0; int size_left = size; int BLOCK_SIZE= GASPI_GPU_BUFFERED; const gaspi_cycles_t s0 = gaspi_get_cycles (); while(size_left > 0) { int i; for(i = 0; i < GASPI_CUDA_EVENTS; i++) { if(size_left > BLOCK_SIZE) copy_size = BLOCK_SIZE; else copy_size = size_left; if(cudaMemcpyAsync(host_ptr+gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue])) { unlock_gaspi(&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } glb_gaspi_ctx.ne_count_c[queue]++; agpu->events[queue][i].segment_remote = segment_id_remote; agpu->events[queue][i].segment_local = segment_id_local; agpu->events[queue][i].size = copy_size; agpu->events[queue][i].rank = rank; agpu->events[queue][i].offset_local = offset_local+gpu_offset; agpu->events[queue][i].offset_remote = offset_remote+gpu_offset; agpu->events[queue][i].in_use = 1; cudaError_t err = cudaEventRecord(agpu->events[queue][i].event,agpu->streams[queue]); if(err != cudaSuccess) { unlock_gaspi(&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } /* Thats not beautiful at all, however, else we have a overflow soon in the queue */ if(agpu->events[queue][i].ib_use) { struct ibv_wc wc; int ne; do { ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc); glb_gaspi_ctx.ne_count_c[queue] -= ne; if (ne == 0) { const gaspi_cycles_t s1 = gaspi_get_cycles (); const gaspi_cycles_t tdelta = s1 - s0; const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs; if (ms > timeout_ms) { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_TIMEOUT; } } } while(ne == 0); agpu->events[queue][i].ib_use = 0; } gpu_offset += copy_size; size_left -= copy_size; if(size_left == 0) break; } for(i = 0; i < GASPI_CUDA_EVENTS; i++) { cudaError_t error; if (agpu->events[queue][i].in_use == 1 ) { do { error = cudaEventQuery(agpu->events[queue][i].event ); if( cudaSuccess == error ) { if (_gaspi_event_send(&agpu->events[queue][i],queue) ) { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } agpu->events[queue][i].in_use = 0; } else if(error == cudaErrorNotReady) { const gaspi_cycles_t s1 = gaspi_get_cycles (); const gaspi_cycles_t tdelta = s1 - s0; const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs; if (ms > timeout_ms) { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return 
GASPI_TIMEOUT; } } else { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } } while(error != cudaSuccess); } } } struct ibv_send_wr *bad_wr; struct ibv_sge slistN; struct ibv_send_wr swrN; slistN.addr = (uintptr_t)(glb_gaspi_ctx.nsrc.buf + notification_id * sizeof(gaspi_notification_id_t)); *((unsigned int *) slistN.addr) = notification_value; slistN.length = sizeof(gaspi_notification_id_t); slistN.lkey =((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey; if((glb_gaspi_ctx.rrmd[segment_id_remote][rank].cudaDevId >= 0)) { swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_addr + notification_id * sizeof(gaspi_notification_id_t)); swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].host_rkey; } else { swrN.wr.rdma.remote_addr = (glb_gaspi_ctx.rrmd[segment_id_remote][rank].addr + notification_id * sizeof(gaspi_notification_id_t)); swrN.wr.rdma.rkey = glb_gaspi_ctx.rrmd[segment_id_remote][rank].rkey; } swrN.sg_list = &slistN; swrN.num_sge = 1; swrN.wr_id = rank; swrN.opcode = IBV_WR_RDMA_WRITE; swrN.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;; swrN.next = NULL; if (ibv_post_send (glb_gaspi_ctx_ib.qpC[queue][rank], &swrN, &bad_wr)) { glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT; unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } glb_gaspi_ctx.ne_count_c[queue]++; unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_SUCCESS; }
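/* A reduced sketch of the buffered transfer loop in the two GASPI routines above: the
 * device buffer is drained in fixed-size chunks, each chunk is copied asynchronously to
 * pinned host memory, and a recorded event lets the caller poll for completion instead
 * of blocking the whole stream.  Chunk size, event count, and names are illustrative;
 * the timeout handling of the original is reduced to a busy wait. */
#include <cuda_runtime.h>

#define CHUNK   (1 << 20)   /* bytes per chunk */
#define NCHUNKS 4

int drain_device_buffer(char *host_dst /* pinned */, const char *dev_src,
                        size_t total, cudaStream_t s)
{
    cudaEvent_t ev[NCHUNKS];
    for (int j = 0; j < NCHUNKS; j++)
        cudaEventCreateWithFlags(&ev[j], cudaEventDisableTiming);

    size_t off = 0;
    int i = 0;
    while (off < total)
    {
        size_t n = (total - off > CHUNK) ? CHUNK : total - off;
        if (cudaMemcpyAsync(host_dst + off, dev_src + off, n,
                            cudaMemcpyDeviceToHost, s) != cudaSuccess)
            return -1;
        cudaEventRecord(ev[i % NCHUNKS], s);
        off += n;
        i++;
    }

    if (i > 0)
    {
        /* poll the last event; earlier chunks are ordered before it on the stream */
        while (cudaEventQuery(ev[(i - 1) % NCHUNKS]) == cudaErrorNotReady)
            ;   /* spin; a real implementation would time out, as the code above does */
    }

    for (int j = 0; j < NCHUNKS; j++)
        cudaEventDestroy(ev[j]);
    return 0;
}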
void THCStorage_resize(THCState *state, THCStorage *self, ptrdiff_t size) { THArgCheck(size >= 0, 2, "invalid size"); THAssert(self->allocator != NULL); int device; THCudaCheck(cudaGetDevice(&device)); if(!(self->flag & TH_STORAGE_RESIZABLE)) THError("Trying to resize storage that is not resizable"); size_t elementSize = at::elementSize(self->scalar_type); if (self->allocator->realloc) { void * data_ptr = self->data_ptr; cudaError_t err = (*self->allocator->realloc)( self->allocatorContext, (void**)&(data_ptr), self->size * elementSize, size * elementSize, THCState_getCurrentStreamOnDevice(state, device)); if (err != cudaSuccess) { THCudaCheck(err); } self->size = size; self->device = device; return; } if(size == 0) { if(self->flag & TH_STORAGE_FREEMEM) { THCudaCheck( (*self->allocator->free)(self->allocatorContext, self->data_ptr)); } self->data_ptr = NULL; self->size = 0; self->device = device; } else { void *data = NULL; cudaError_t err = (*self->allocator->malloc)(self->allocatorContext, (void**)&(data), size * elementSize, THCState_getCurrentStreamOnDevice(state, device)); THCudaCheck(err); if (self->data_ptr) { // Enable p2p access when the memcpy is across devices THCState_getPeerToPeerAccess(state, device, self->device); THCudaCheck(cudaMemcpyAsync(data, self->data_ptr, THMin(self->size, size) * elementSize, cudaMemcpyDeviceToDevice, THCState_getCurrentStream(state))); if(self->flag & TH_STORAGE_FREEMEM) { THCudaCheck( (*self->allocator->free)(self->allocatorContext, self->data_ptr)); } } self->data_ptr = data; self->size = size; self->device = device; } }
static void where(Param<uint> &out, CParam<T> in)
{
    uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
    threads_x      = std::min(threads_x, THREADS_PER_BLOCK);
    uint threads_y = THREADS_PER_BLOCK / threads_x;

    uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
    uint blocks_y = divup(in.dims[1], threads_y);

    Param<uint> rtmp;
    Param<uint> otmp;
    rtmp.dims[0]    = blocks_x;
    otmp.dims[0]    = in.dims[0];
    rtmp.strides[0] = 1;
    otmp.strides[0] = 1;

    for (int k = 1; k < 4; k++) {
        rtmp.dims[k]    = in.dims[k];
        rtmp.strides[k] = rtmp.strides[k - 1] * rtmp.dims[k - 1];

        otmp.dims[k]    = in.dims[k];
        otmp.strides[k] = otmp.strides[k - 1] * otmp.dims[k - 1];
    }

    int rtmp_elements = rtmp.strides[3] * rtmp.dims[3];
    rtmp.ptr          = memAlloc<uint>(rtmp_elements);

    int otmp_elements = otmp.strides[3] * otmp.dims[3];
    otmp.ptr          = memAlloc<uint>(otmp_elements);

    scan_first_launcher<T, uint, af_notzero_t, false, true>(otmp, rtmp, in,
                                                            blocks_x, blocks_y,
                                                            threads_x);

    // Linearize the dimensions and perform scan
    Param<uint> ltmp = rtmp;
    ltmp.dims[0]     = rtmp_elements;
    for (int k = 1; k < 4; k++) {
        ltmp.dims[k]    = 1;
        ltmp.strides[k] = rtmp_elements;
    }

    scan_first<uint, uint, af_add_t, true>(ltmp, ltmp);

    // Get output size and allocate output
    uint total;
    CUDA_CHECK(cudaMemcpyAsync(&total, rtmp.ptr + rtmp_elements - 1, sizeof(uint),
                               cudaMemcpyDeviceToHost,
                               cuda::getStream(cuda::getActiveDeviceId())));
    CUDA_CHECK(cudaStreamSynchronize(cuda::getStream(cuda::getActiveDeviceId())));

    out.ptr        = memAlloc<uint>(total);
    out.dims[0]    = total;
    out.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        out.dims[k]    = 1;
        out.strides[k] = total;
    }

    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * in.dims[2], blocks_y * in.dims[3]);

    uint lim = divup(otmp.dims[0], (threads_x * blocks_x));

    CUDA_LAUNCH((get_out_idx<T>), blocks, threads,
                out.ptr, otmp, rtmp, in, blocks_x, blocks_y, lim);
    POST_LAUNCH_CHECK();

    memFree(rtmp.ptr);
    memFree(otmp.ptr);
}
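/* Reading a single device-side result (here the total count produced by the scan)
 * follows the pattern used above: an asynchronous D2H copy into a host variable, then a
 * synchronize on the same stream before the value is used.  A minimal stand-alone
 * version with illustrative names: */
#include <cuda_runtime.h>

unsigned int read_count(const unsigned int *d_count, cudaStream_t s)
{
    unsigned int h_count = 0;
    cudaMemcpyAsync(&h_count, d_count, sizeof(unsigned int),
                    cudaMemcpyDeviceToHost, s);
    cudaStreamSynchronize(s);   /* h_count is valid only after this returns */
    return h_count;
}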
gaspi_return_t pgaspi_gpu_write(const gaspi_segment_id_t segment_id_local, const gaspi_offset_t offset_local, const gaspi_rank_t rank, const gaspi_segment_id_t segment_id_remote, const gaspi_offset_t offset_remote, const gaspi_size_t size, const gaspi_queue_id_t queue, const gaspi_timeout_t timeout_ms) { if( glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId < 0 || size <= GASPI_GPU_DIRECT_MAX ) { return gaspi_write(segment_id_local, offset_local, rank, segment_id_remote, offset_remote, size, queue, timeout_ms); } if(lock_gaspi_tout (&glb_gaspi_ctx.lockC[queue], timeout_ms)) return GASPI_TIMEOUT; char* host_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].host_ptr + NOTIFY_OFFSET + offset_local); char* device_ptr = (char*)(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].addr + offset_local); gaspi_gpu* agpu = _gaspi_find_gpu(glb_gaspi_ctx.rrmd[segment_id_local][glb_gaspi_ctx.rank].cudaDevId); if( !agpu ) { gaspi_print_error("No GPU found or not initialized (gaspi_init_GPUs)."); return GASPI_ERROR; } int size_left = size; int copy_size = 0; int gpu_offset = 0; const int BLOCK_SIZE = GASPI_GPU_BUFFERED; const gaspi_cycles_t s0 = gaspi_get_cycles (); while(size_left > 0) { int i; for(i = 0; i < GASPI_CUDA_EVENTS; i++) { if(size_left > BLOCK_SIZE) copy_size = BLOCK_SIZE; else copy_size = size_left; if( cudaMemcpyAsync(host_ptr + gpu_offset, device_ptr + gpu_offset, copy_size, cudaMemcpyDeviceToHost, agpu->streams[queue])) { unlock_gaspi(&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } glb_gaspi_ctx.ne_count_c[queue]++; agpu->events[queue][i].segment_remote = segment_id_remote; agpu->events[queue][i].segment_local = segment_id_local; agpu->events[queue][i].size = copy_size; agpu->events[queue][i].rank = rank; agpu->events[queue][i].offset_local = offset_local+gpu_offset; agpu->events[queue][i].offset_remote = offset_remote+gpu_offset; agpu->events[queue][i].in_use =1; cudaError_t err = cudaEventRecord(agpu->events[queue][i].event, agpu->streams[queue]); if(err != cudaSuccess) { glb_gaspi_ctx.qp_state_vec[queue][rank] = GASPI_STATE_CORRUPT; unlock_gaspi(&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } gpu_offset += copy_size; size_left -= copy_size; if(size_left == 0) break; if(agpu->events[queue][i].ib_use) { struct ibv_wc wc; int ne; do { ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc); glb_gaspi_ctx.ne_count_c[queue] -= ne; if (ne == 0) { const gaspi_cycles_t s1 = gaspi_get_cycles (); const gaspi_cycles_t tdelta = s1 - s0; const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs; if (ms > timeout_ms) { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_TIMEOUT; } } } while(ne==0); agpu->events[queue][i].ib_use = 0; } } for(i = 0; i < GASPI_CUDA_EVENTS; i++) { cudaError_t error; if ( agpu->events[queue][i].in_use == 1 ) { do { error = cudaEventQuery(agpu->events[queue][i].event ); if( cudaSuccess == error ) { if (_gaspi_event_send(&agpu->events[queue][i],queue)) { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } agpu->events[queue][i].in_use = 0; } else if(error == cudaErrorNotReady) { const gaspi_cycles_t s1 = gaspi_get_cycles (); const gaspi_cycles_t tdelta = s1 - s0; const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs; if (ms > timeout_ms) { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_TIMEOUT; } } else { unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_ERROR; } } while(error != cudaSuccess); } } } unlock_gaspi (&glb_gaspi_ctx.lockC[queue]); return GASPI_SUCCESS; }
int main(int argc,char **argv){ // Print GPU properties //print_properties(); // Files to print the result after the last time step FILE *rho_file; FILE *E_file; rho_file = fopen("rho_final.txt", "w"); E_file = fopen("E_final.txt", "w"); // Construct initial condition for problem ICsinus Config(-1.0, 1.0, -1.0, 1.0); //ICsquare Config(0.5,0.5,gasGam); // Set initial values for Configuration 1 /* Config.set_rho(rhoConfig19); Config.set_pressure(pressureConfig19); Config.set_u(uConfig19); Config.set_v(vConfig19); */ // Determining global border based on left over tiles (a little hack) int globalPadding; globalPadding = (nx+2*border+16)/16; globalPadding = 16*globalPadding - (nx+2*border); //printf("Globalpad: %i\n", globalPadding); // Change border to add padding //border = border + globalPadding/2; // Initiate the matrices for the unknowns in the Euler equations cpu_ptr_2D rho(nx, ny, border,1); cpu_ptr_2D E(nx, ny, border,1); cpu_ptr_2D rho_u(nx, ny, border,1); cpu_ptr_2D rho_v(nx, ny, border,1); cpu_ptr_2D zeros(nx, ny, border,1); // Set initial condition Config.setIC(rho, rho_u, rho_v, E); double timeStart = get_wall_time(); // Test cpu_ptr_2D rho_dummy(nx, ny, border); cpu_ptr_2D E_dummy(nx, ny, border); /* rho_dummy.xmin = -1.0; rho_dummy.ymin = -1.0; E_dummy.xmin = -1.0; E_dummy.ymin = -1.0; */ // Set block and grid sizes dim3 gridBC = dim3(1, 1, 1); dim3 blockBC = dim3(BLOCKDIM_BC,1,1); dim3 gridBlockFlux; dim3 threadBlockFlux; dim3 gridBlockRK; dim3 threadBlockRK; computeGridBlock(gridBlockFlux, threadBlockFlux, nx + 2*border, ny + 2*border, INNERTILEDIM_X, INNERTILEDIM_Y, BLOCKDIM_X, BLOCKDIM_Y); computeGridBlock(gridBlockRK, threadBlockRK, nx + 2*border, ny + 2*border, BLOCKDIM_X_RK, BLOCKDIM_Y_RK, BLOCKDIM_X_RK, BLOCKDIM_Y_RK); int nElements = gridBlockFlux.x*gridBlockFlux.y; // Allocate memory for the GPU pointers gpu_ptr_1D L_device(nElements); gpu_ptr_1D dt_device(1); gpu_ptr_2D rho_device(nx, ny, border); gpu_ptr_2D E_device(nx, ny, border); gpu_ptr_2D rho_u_device(nx, ny, border); gpu_ptr_2D rho_v_device(nx, ny, border); gpu_ptr_2D R0(nx, ny, border); gpu_ptr_2D R1(nx, ny, border); gpu_ptr_2D R2(nx, ny, border); gpu_ptr_2D R3(nx, ny, border); gpu_ptr_2D Q0(nx, ny, border); gpu_ptr_2D Q1(nx, ny, border); gpu_ptr_2D Q2(nx, ny, border); gpu_ptr_2D Q3(nx, ny, border); // Allocate pinned memory on host init_allocate(); // Set BC arguments set_bc_args(BCArgs[0], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border); set_bc_args(BCArgs[1], Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx+2*border, ny+2*border, border); set_bc_args(BCArgs[2], rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx+2*border, ny+2*border, border); // Set FLUX arguments set_flux_args(fluxArgs[0], L_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y); set_flux_args(fluxArgs[1], L_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(),R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), nx, ny, border, rho.get_dx(), rho.get_dy(), theta, gasGam, INNERTILEDIM_X, INNERTILEDIM_Y); // Set TIME argument set_dt_args(dtArgs, L_device.getRawPtr(), dt_device.getRawPtr(), nElements, rho.get_dx(), rho.get_dy(), cfl_number); 
// Set Rk arguments set_rk_args(RKArgs[0], dt_device.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), nx, ny, border); set_rk_args(RKArgs[1], dt_device.getRawPtr(), Q0.getRawPtr(), Q1.getRawPtr(), Q2.getRawPtr(), Q3.getRawPtr(), R0.getRawPtr(), R1.getRawPtr(), R2.getRawPtr(), R3.getRawPtr(), rho_device.getRawPtr(), rho_u_device.getRawPtr(), rho_v_device.getRawPtr(), E_device.getRawPtr(), nx, ny, border); L_device.set(FLT_MAX); /* R0.upload(zeros.get_ptr()); R1.upload(zeros.get_ptr()); R2.upload(zeros.get_ptr()); R3.upload(zeros.get_ptr()); Q0.upload(zeros.get_ptr()); Q1.upload(zeros.get_ptr()); Q2.upload(zeros.get_ptr()); Q3.upload(zeros.get_ptr()); */ R0.set(0,0,0,nx,ny,border); R1.set(0,0,0,nx,ny,border); R2.set(0,0,0,nx,ny,border); R3.set(0,0,0,nx,ny,border); Q0.set(0,0,0,nx,ny,border); Q1.set(0,0,0,nx,ny,border); Q2.set(0,0,0,nx,ny,border); Q3.set(0,0,0,nx,ny,border); rho_device.upload(rho.get_ptr()); rho_u_device.upload(rho_u.get_ptr()); rho_v_device.upload(rho_v.get_ptr()); E_device.upload(E.get_ptr()); // Update boudries callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[0]); //Create cuda stream cudaStream_t stream1; cudaStreamCreate(&stream1); cudaEvent_t dt_complete; cudaEventCreate(&dt_complete); while (currentTime < timeLength && step < maxStep){ //RK1 //Compute flux callFluxKernel(gridBlockFlux, threadBlockFlux, 0, fluxArgs[0]); // Compute timestep (based on CFL condition) callDtKernel(TIMETHREADS, dtArgs); cudaMemcpyAsync(dt_host, dt_device.getRawPtr(), sizeof(float), cudaMemcpyDeviceToHost, stream1); cudaEventRecord(dt_complete, stream1); // Perform RK1 step callRKKernel(gridBlockRK, threadBlockRK, 0, RKArgs[0]); //Update boudries callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[1]); //RK2 // Compute flux callFluxKernel(gridBlockFlux, threadBlockFlux, 1, fluxArgs[1]); //Perform RK2 step callRKKernel(gridBlockRK, threadBlockRK, 1, RKArgs[1]); //cudaEventRecord(srteam_sync, srteam1); callCollectiveSetBCPeriodic(gridBC, blockBC, BCArgs[2]); cudaEventSynchronize(dt_complete); step++; currentTime += *dt_host; // printf("Step: %i, current time: %.6f dt:%.6f\n" , step,currentTime, dt_host[0]); } //cuProfilerStop(); //cudaProfilerStop(); printf("Elapsed time %.5f", get_wall_time() - timeStart); E_device.download(E.get_ptr()); rho_u_device.download(rho_u.get_ptr()); rho_v_device.download(rho_v.get_ptr()); rho_device.download(rho_dummy.get_ptr()); rho_dummy.printToFile(rho_file, true, false); Config.exactSolution(E_dummy, currentTime); E_dummy.printToFile(E_file, true, false); float LinfError = Linf(E_dummy, rho_dummy); float L1Error = L1(E_dummy, rho_dummy); float L1Error2 = L1test(E_dummy, rho_dummy); printf("nx: %i\t Linf error %.9f\t L1 error %.7f L1test erro %.7f", nx, LinfError, L1Error, L1Error2); printf("nx: %i step: %i, current time: %.6f dt:%.6f\n" , nx, step,currentTime, dt_host[0]); /* cudaMemcpy(L_host, L_device, sizeof(float)*(nElements), cudaMemcpyDeviceToHost); for (int i =0; i < nElements; i++) printf(" %.7f ", L_host[i]); */ printf("%s\n", cudaGetErrorString(cudaGetLastError())); return(0); }
void GranularGPUDataTransferer::CopyCPUToGPUAsync(const void* cpuBuffer, size_t numElements,
                                                  size_t elementSize, void* gpuBuffer)
{
    PrepareDevice(m_deviceId);

    cudaMemcpyAsync(gpuBuffer, cpuBuffer, numElements * elementSize,
                    cudaMemcpyHostToDevice, GetAssignStream()) || "cudaMemcpyAsync failed";
}
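/* cudaMemcpyAsync only overlaps with kernel execution when the host buffer is
 * page-locked; with pageable memory the runtime falls back to a staged, effectively
 * synchronous copy.  A minimal sketch of allocating a pinned staging buffer (the helper
 * name is illustrative and not part of the transferer class above): */
#include <cuda_runtime.h>
#include <stddef.h>

float *alloc_pinned(size_t n)
{
    float *p = NULL;
    /* cudaHostAllocDefault gives plain page-locked memory; other flags
     * (e.g. cudaHostAllocWriteCombined) trade CPU read speed for faster H2D copies */
    if (cudaHostAlloc((void **)&p, n * sizeof(float), cudaHostAllocDefault) != cudaSuccess)
        return NULL;
    return p;   /* release later with cudaFreeHost(p) */
}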
int main(int argc, char*argv[]) { FILE *fp; uint16_t i, fsize, pad_size, stream_id; char * rtp_pkt; uint8_t default_aes_keys[AES_KEY_SIZE], default_ivs[AES_IV_SIZE], default_hmac_keys[HMAC_KEY_SIZE]; struct timespec start, end; #if defined(KERNEL_TEST) struct timespec kernel_start, kernel_end; #endif cudaEvent_t startE, stopE; cudaEventCreate(&startE); cudaEventCreate(&stopE); uint32_t NUM_FLOWS, STREAM_NUM; if (argc > 2) { NUM_FLOWS = atoi(argv[1]); STREAM_NUM = atoi(argv[2]); } else { NUM_FLOWS = 8192; STREAM_NUM = 1; } //printf ("Num of flows is %d, stream num is %d\n", NUM_FLOWS, STREAM_NUM); cudaStream_t stream[STREAM_NUM]; for (i = 0; i < STREAM_NUM; i ++) { cudaStreamCreate(&stream[i]); } uint8_t * host_in,*device_in[STREAM_NUM]; uint8_t * host_aes_keys,* device_aes_keys[STREAM_NUM]; uint8_t * host_ivs,* device_ivs[STREAM_NUM]; uint8_t * host_hmac_keys,*device_hmac_keys[STREAM_NUM]; uint32_t * host_pkt_offset,*device_pkt_offset[STREAM_NUM]; uint16_t * host_actual_length,*device_actual_length[STREAM_NUM]; double diff; uint8_t a = 123; fp = fopen("rtp.pkt", "rb"); fseek(fp, 0, SEEK_END); // NOTE: fsize should be 1356 bytes //fsize = ftell(fp); fsize = 1328; fseek(fp, 0, SEEK_SET); rtp_pkt = (char *)calloc(fsize, sizeof(char)); fread(rtp_pkt, fsize, sizeof(char), fp); pad_size = (fsize + 63 + 9) & (~0x03F); //printf("the original package is %d bytes,now we pad it to %d bytes\n", fsize, pad_size); for (i = 0; i < AES_KEY_SIZE; i ++) default_aes_keys[i] = a; for (i = 0; i < AES_IV_SIZE; i ++) default_ivs[i] = a; for (i = 0; i < HMAC_KEY_SIZE; i ++) default_hmac_keys[i] = a; //printf("duplicate it %d times, takes %d bytes\n",NUM_FLOWS,pad_size*NUM_FLOWS); cudaHostAlloc((void **)&host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaHostAllocDefault); cudaHostAlloc((void **)&host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaHostAllocWriteCombined); cudaHostAlloc((void **)&host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaHostAllocWriteCombined); for (i = 0; i < NUM_FLOWS; i ++){ memcpy(host_in + i * pad_size, rtp_pkt, fsize * sizeof(uint8_t)); memcpy((uint8_t *)host_aes_keys + i * AES_KEY_SIZE, default_aes_keys, AES_KEY_SIZE); memcpy((uint8_t *)host_ivs + i * AES_IV_SIZE, default_ivs, AES_IV_SIZE); memcpy((uint8_t *)host_hmac_keys + i * HMAC_KEY_SIZE, default_hmac_keys, HMAC_KEY_SIZE); host_pkt_offset[i] = i * pad_size; host_actual_length[i] = fsize; } for (i = 0; i < STREAM_NUM; i ++) { cudaMalloc((void **)&(device_in[i]), pad_size * NUM_FLOWS * sizeof(uint8_t)); cudaMalloc((void **)&(device_aes_keys[i]), NUM_FLOWS * AES_KEY_SIZE); cudaMalloc((void **)&(device_ivs[i]), NUM_FLOWS * AES_IV_SIZE); cudaMalloc((void **)&(device_hmac_keys[i]), NUM_FLOWS * HMAC_KEY_SIZE); cudaMalloc((void **)&(device_pkt_offset[i]), NUM_FLOWS * PKT_OFFSET_SIZE); cudaMalloc((void **)&(device_actual_length[i]), NUM_FLOWS * PKT_LENGTH_SIZE); } /* warm up */ for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) { cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * 
AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); co_aes_sha1_gpu( device_in[stream_id], device_in[stream_id], device_aes_keys[stream_id], device_ivs[stream_id], device_hmac_keys[stream_id], device_pkt_offset[stream_id], device_actual_length[stream_id], NUM_FLOWS, NULL, THREADS_PER_BLK, stream[stream_id]); cudaDeviceSynchronize(); } /* Real test */ for (i = 0; i < 1; i ++) { clock_gettime(CLOCK_MONOTONIC, &start); cudaEventRecord(startE, 0); for (stream_id = 0; stream_id < STREAM_NUM; stream_id ++) { cudaMemcpyAsync(device_in[stream_id], host_in, pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_aes_keys[stream_id], host_aes_keys, NUM_FLOWS * AES_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_ivs[stream_id], host_ivs, NUM_FLOWS * AES_IV_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_hmac_keys[stream_id], host_hmac_keys, NUM_FLOWS * HMAC_KEY_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_pkt_offset[stream_id], host_pkt_offset, NUM_FLOWS * PKT_OFFSET_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); cudaMemcpyAsync(device_actual_length[stream_id], host_actual_length, NUM_FLOWS * PKT_LENGTH_SIZE, cudaMemcpyHostToDevice, stream[stream_id]); #if defined(KERNEL_TEST) cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &kernel_start); //gettimeofday(&kernel_start, NULL); #endif co_aes_sha1_gpu( device_in[stream_id], device_in[stream_id], device_aes_keys[stream_id], device_ivs[stream_id], device_hmac_keys[stream_id], device_pkt_offset[stream_id], device_actual_length[stream_id], NUM_FLOWS, NULL, THREADS_PER_BLK, stream[stream_id]); #if defined(KERNEL_TEST) cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &kernel_end); //gettimeofday(&kernel_end, NULL); #endif cudaMemcpyAsync(host_in, device_in[stream_id], pad_size * NUM_FLOWS * sizeof(uint8_t), cudaMemcpyDeviceToHost, stream[stream_id]); } cudaDeviceSynchronize(); clock_gettime(CLOCK_MONOTONIC, &end); cudaEventRecord(stopE, 0); cudaEventSynchronize(stopE); float time; cudaEventElapsedTime(&time, startE, stopE); //printf("event speed is ------- %f Gbps\n", (fsize * 8 * NUM_FLOWS * STREAM_NUM * 1e-6)/time); #if defined(KERNEL_TEST) diff = 1000000 * (kernel_end.tv_sec-kernel_start.tv_sec)+ (kernel_end.tv_nsec-kernel_start.tv_nsec)/1000; printf("Only Kernel, the difference is %lf ms, speed is %lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff); #else diff = 1000000 * (end.tv_sec-start.tv_sec)+ (end.tv_nsec-start.tv_nsec)/1000; printf("%lf\n", (double)diff/1000); //printf("%lfms,%lf Mbps\n", (double)diff/1000, (double)((fsize * 8) * NUM_FLOWS * STREAM_NUM) / diff); #endif } return 0; }
To reduce_all(CParam<Ti> in, bool change_nan, double nanval)
{
    int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3];

    bool is_linear = (in.strides[0] == 1);
    for (int k = 1; k < 4; k++) {
        is_linear &= (in.strides[k] == (in.strides[k - 1] * in.dims[k - 1]));
    }

    // FIXME: Use better heuristics to get to the optimum number
    if (in_elements > 4096 || !is_linear) {
        if (is_linear) {
            in.dims[0] = in_elements;
            for (int k = 1; k < 4; k++) {
                in.dims[k]    = 1;
                in.strides[k] = in_elements;
            }
        }

        uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
        threads_x      = std::min(threads_x, THREADS_PER_BLOCK);
        uint threads_y = THREADS_PER_BLOCK / threads_x;

        Param<To> tmp;
        uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
        uint blocks_y = divup(in.dims[1], threads_y);

        tmp.dims[0]    = blocks_x;
        tmp.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            tmp.dims[k]    = in.dims[k];
            tmp.strides[k] = tmp.dims[k - 1] * tmp.strides[k - 1];
        }

        int tmp_elements = tmp.strides[3] * tmp.dims[3];
        auto tmp_alloc   = memAlloc<To>(tmp_elements);
        tmp.ptr          = tmp_alloc.get();

        reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y, threads_x,
                                          change_nan, nanval);

        std::vector<To> h_data(tmp_elements);

        CUDA_CHECK(cudaMemcpyAsync(h_data.data(), tmp.ptr, tmp_elements * sizeof(To),
                                   cudaMemcpyDeviceToHost, cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Binary<To, op> reduce;
        To out = Binary<To, op>::init();

        for (int i = 0; i < tmp_elements; i++) { out = reduce(out, h_data[i]); }

        return out;
    } else {
        std::vector<Ti> h_data(in_elements);

        CUDA_CHECK(cudaMemcpyAsync(h_data.data(), in.ptr, in_elements * sizeof(Ti),
                                   cudaMemcpyDeviceToHost, cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Transform<Ti, To, op> transform;
        Binary<To, op> reduce;
        To out       = Binary<To, op>::init();
        To nanval_to = scalar<To>(nanval);

        for (int i = 0; i < in_elements; i++) {
            To in_val = transform(h_data[i]);
            if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval_to;
            out = reduce(out, in_val);
        }

        return out;
    }
}
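/* The helper above reduces large inputs to one partial result per block on the device,
 * copies those partials to the host asynchronously, synchronizes the stream, and
 * finishes the reduction on the CPU.  A reduced sketch of that final step for a plain
 * sum (illustrative names, not the ArrayFire internals): */
#include <cuda_runtime.h>

float sum_partials(const float *d_partials, int nblocks, cudaStream_t s)
{
    float h_partials[1024];          /* assumes nblocks <= 1024 */
    cudaMemcpyAsync(h_partials, d_partials, nblocks * sizeof(float),
                    cudaMemcpyDeviceToHost, s);
    cudaStreamSynchronize(s);        /* partials are valid from here on */

    float total = 0.0f;
    for (int i = 0; i < nblocks; i++)
        total += h_partials[i];
    return total;
}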
static void gpu_memcpy_async(void *dst, void *src, size_t size, void *async_id)
{
    cudaStream_t st = *((cudaStream_t *) async_id);
    CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, st));
}
DeepCopy<HostSpace,CudaSpace,Cuda>::DeepCopy( const Cuda & instance , void * dst ,
                                              const void * src , size_t n )
{
    CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault ,
                                     instance.cuda_stream() ) );
}
void DeepCopyAsyncCuda( void * dst , const void * src , size_t n )
{
    cudaStream_t s = get_deep_copy_stream();
    CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , s ) );
    cudaStreamSynchronize(s);
}