Code example #1
File: blasx_mem_control.c Project: 529038378/BLASX
void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, double** C_dev, int block_dim, int stream_num, int x,int y, int z, int nrowc, int ncolc, int ldc, double *C, enum CBLAS_UPLO Uplo) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        //assume 1-switcher
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
Code example #2
File: blasx_mem_control.c Project: 529038378/BLASX
void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, cuDoubleComplex** C_dev, int block_dim, int stream_num, int x, int y, int z, int nrowc, int ncolc, int ldc, cuDoubleComplex *C) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
}
Code example #3
File: cuda.cpp Project: jcvernaleo/ccminer
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
	cudaError_t result = cudaSuccess;
	if (abort_flag)
		return result;
	if (situation >= 0)
	{
		static std::map<int, tsumarray> tsum;

		double a = 0.95, b = 0.05;
		if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

		double tsync = 0.0;
		double tsleep = 0.95 * tsum[situation].value[thr_id];
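		// sleep for most of the expected wait (an exponentially weighted average of past sync times) to avoid busy-waiting inside cudaStreamSynchronize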
		if (cudaStreamQuery(stream) == cudaErrorNotReady)
		{
			usleep((useconds_t)(1e6*tsleep));
			struct timeval tv_start, tv_end;
			gettimeofday(&tv_start, NULL);
			result = cudaStreamSynchronize(stream);
			gettimeofday(&tv_end, NULL);
			tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
		}
		if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
	}
	else
		result = cudaStreamSynchronize(stream);
	return result;
}
Code example #4
File: blasx_mem_control.c Project: 529038378/BLASX
void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher, int switcher_rs, cudaStream_t *stream, double** buffer_dev, int block_dim, int stream_num, int x, int y, int z, int nrowb, int ncolb, int ldb, double *B, int* switch_tracker) {
    int temp = 0;
    for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs] ; temp++) {
        //            printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
        int row = z;
        int col = tasks_rs[temp+stream_num*(1-switcher_rs)];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) {
        //assume 1-switcher
        //printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
        int row = z;
        int col = tasks_rs[temp+stream_num*switcher_rs];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}
Code example #5
JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
    (JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept) {

	// the kernel is written to take multiple data sets and produce a set of results, but we're going
	// to run it as multiple parallel kernels, each producing a single result instead
	double *d_dataBuffer, *d_weightsBuffer, *d_score;
	int dataCount, dataLen, whichGPU;
	jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
	cudaStream_t stream;

	// select a GPU for *this* specific dataset
	whichGPU = get_gpu();
	checkCudaErrors(cudaSetDevice(whichGPU));
	checkCudaErrors(cudaStreamCreate(&stream));

	// get a pointer to the raw input data, pinning them in memory
	dataCount = env->GetArrayLength(data);
	dataLen = dataCount*sizeof(double);
	assert(dataCount == env->GetArrayLength(weights));
	h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
	h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);

	// copy input data to the GPU memory
	// TODO: It may be better to access host memory directly, skipping the copy.  Investigate.
	checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
	checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
	checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	// synchronize before unpinning, and also because there is a device-device transfer in predictKernelDevice
	checkCudaErrors(cudaStreamSynchronize(stream));
	// un-pin the host arrays, as we're done with them
	env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
	env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);

	// allocate storage for the result
	checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));

	// run the kernel, to produce a result
	predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);

	checkCudaErrors(cudaStreamSynchronize(stream));

	// copy result back to host
	checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double), cudaMemcpyDeviceToHost, stream));

	checkCudaErrors(cudaStreamSynchronize(stream));

	// Free the GPU buffers
	checkCudaErrors(freeBest(d_dataBuffer));
	checkCudaErrors(freeBest(d_weightsBuffer));
	checkCudaErrors(freeBest(d_score));

	checkCudaErrors(cudaStreamDestroy(stream));

	return h_score;
}
Code example #6
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) 
{
	float *sub11;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 

	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);

	unsigned long z;

	switch (s)
	{
		case 0:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				pivot = sub11[z+z*ld];
				STARPU_ASSERT(pivot != 0.0f);
		
				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
		
				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
						&sub11[z+(z+1)*ld], ld,
						&sub11[(z+1)+z*ld], 1,
						&sub11[(z+1) + (z+1)*ld],ld);
			}
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			for (z = 0; z < nx; z++)
			{
				float pivot;
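				/* the pivot lives in device memory: fetch it asynchronously, then wait for the copy before testing it on the host */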
				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
				cudaStreamSynchronize(starpu_cuda_get_local_stream());

				STARPU_ASSERT(pivot != 0.0f);
				
				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
				
				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
								&sub11[z+(z+1)*ld], ld,
								&sub11[(z+1)+z*ld], 1,
								&sub11[(z+1) + (z+1)*ld],ld);
			}

			cudaStreamSynchronize(starpu_cuda_get_local_stream());

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}
Code example #7
File: base_data_layer.cpp Project: beniz/caffe
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  cudaStream_t stream2;
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      Batch<Dtype>* batch = prefetch_free_.pop();
      Batch<Dtype>* batch_untransformed = NULL;
      if (untransformed_top_)
        {
          batch_untransformed = prefetch_free_untransformed_.pop();
          load_batch_and_untransformed_batch(batch,batch_untransformed);
        }
      else
        load_batch(batch);

#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
        if (untransformed_top_)
          {
            batch_untransformed->data_.data().get()->async_gpu_push(stream2);
            CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
          }
      }
#endif
      prefetch_full_.push(batch);
      if (untransformed_top_)
        prefetch_full_untransformed_.push(batch_untransformed);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
  }
#endif
}
Code example #8
File: FloatMatrixStrmm_c.c Project: hsyl20/HaskellPU
static void strmm_cuda(void *descr[], void *args) {
  float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

  unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
  unsigned h = STARPU_MATRIX_GET_NX(descr[1]);

  unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
  unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
  unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
  
  struct strmm_arg * arg = (struct strmm_arg *)args;

  cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
  cublasOperation_t trans = CUBLAS_OP_T;

  const float factor = 1.0f;
  
  cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

  cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc);
  cudaStreamSynchronize(starpu_cuda_get_local_stream());

  free(arg);
}
Code example #9
File: Host.c Project: Haider-BA/Matlab2CPP
/**
 * @brief This performs the exchanging of all necessary halos between 2 neighboring MPI processes
 *
 * @param[in]		cartComm	The Cartesian MPI communicator
 * @param[in]		domSize		The 2D size of the local domain
 * @param[in]		topIndex	The 2D index of the calling MPI process in the topology
 * @param[in]		neighbors	The list of ranks which are direct neighbors to the caller
 * @param[in]		copyStream	The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[in, out]	devBlocks	The 2 device blocks that are updated during the Jacobi run
 * @param[in, out]	devSideEdges	The 2 side edges (parallel to the Y direction) that hold the packed halo values before sending them
 * @param[in, out]	devHaloLines	The 2 halo lines (parallel to the Y direction) that hold the packed halo values after receiving them
 * @param[in, out] 	hostSendLines	The 2 host send buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @param[in, out]	hostRecvLines	The 2 host receive buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @return				The time spent during the MPI transfers
 */
double TransferAllHalos(MPI_Comm cartComm, const int2 * domSize, const int2 * topIndex, const int * neighbors, cudaStream_t copyStream,
	real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2], real * hostSendLines[2], real * hostRecvLines[2])
{
	real * devSendLines[2] = {devBlocks[0] + domSize->x + 3, devBlocks[0] + domSize->y * (domSize->x + 2) + 1};
	real * devRecvLines[2] = {devBlocks[0] + 1, devBlocks[0] + (domSize->y + 1) * (domSize->x + 2) + 1};
	int yNeighbors[2] = {neighbors[DIR_TOP], neighbors[DIR_BOTTOM]};
	int xNeighbors[2] = {neighbors[DIR_LEFT], neighbors[DIR_RIGHT]};
	int2 order = make_int2(topIndex->x % 2, topIndex->y % 2);
	double transferTime;

	// Populate the block's side edges
	CopyDevSideEdgesFromBlock(devBlocks[0], devSideEdges, domSize, neighbors, copyStream);

	// Exchange data with the top and bottom neighbors
	transferTime = MPI_Wtime();
	ExchangeHalos(cartComm, devSendLines[  order.y  ], hostSendLines[0], hostRecvLines[0], devRecvLines[  order.y  ], yNeighbors[  order.y  ], domSize->x);
	ExchangeHalos(cartComm, devSendLines[1 - order.y], hostSendLines[0], hostRecvLines[0], devRecvLines[1 - order.y], yNeighbors[1 - order.y], domSize->x);
	SafeCudaCall(cudaStreamSynchronize(copyStream));
	
	// Exchange data with the left and right neighbors
	ExchangeHalos(cartComm, devSideEdges[  order.x  ], hostSendLines[1], hostRecvLines[1], devHaloLines[  order.x  ], xNeighbors[  order.x  ], domSize->y);
	ExchangeHalos(cartComm, devSideEdges[1 - order.x], hostSendLines[1], hostRecvLines[1], devHaloLines[1 - order.x], xNeighbors[1 - order.x], domSize->y); 
	transferTime = MPI_Wtime() - transferTime;

	// Copy the received halos to the device block
	CopyDevHalosToBlock(devBlocks[0], devHaloLines[0], devHaloLines[1], domSize, neighbors);

	return transferTime;
}
Code example #10
File: forward.c Project: edenden/ixmap-cuda
void forward_process_synchronized(struct ixmapfwd_thread *thread,
	unsigned int port_index, struct ixmap_packet *packet,
	unsigned int num_packets, struct ixmap_packet_cuda *result, uint8_t *read_buf)
{
	int fd, i;

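	/* wait for the asynchronous GPU work queued on this thread's stream to finish before consuming result[] */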
	cudaStreamSynchronize(thread->stream);

	for(i = 0; i < num_packets; i++){
		if(result[i].outif >= 0){
			ixmap_tx_assign(thread->plane, result[i].outif,
				thread->buf, &packet[i]);
		}else if(result[i].outif == -1){
			goto packet_drop;
		}else{
			goto packet_inject;
		}

		continue;
packet_inject:
		memcpy(read_buf, packet[i].slot_buf, packet[i].slot_size);
		fd = thread->tun_plane->ports[port_index].fd;
		write(fd, read_buf, packet[i].slot_size);
packet_drop:
		ixmap_slot_release(thread->buf, packet[i].slot_index);
	}
	return;
}
Code example #11
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;// create a CUDA stream, non-blocking type
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) { // keep loading batches of data
      Batch<Dtype>* batch = prefetch_free_.pop();// grab a free batch
      load_batch(batch);// fill it with a batch of data
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        if (this->output_labels_) {
          batch->label_.data().get()->async_gpu_push(stream);
        }
        CUDA_CHECK(cudaStreamSynchronize(stream));// synchronize with the GPU
      }
#endif
      prefetch_full_.push(batch);// push onto the queue of loaded batches
    }
  } catch (boost::thread_interrupted&) {// catch the exception and leave the while loop
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));// destroy the CUDA stream
  }
#endif
}
Code example #12
File: CUDABLAS1.cpp Project: fast-project/lama
void CUDABLAS1::scal( IndexType n, const double alpha, double* x_d, const IndexType incx, SyncToken* syncToken )
{
    LAMA_CHECK_CUDA_ACCESS

    cudaStream_t stream = NULL;

    if ( syncToken )
    {
        CUDAStreamSyncToken* cudaStreamSyncToken = dynamic_cast<CUDAStreamSyncToken*>( syncToken );
        LAMA_ASSERT_DEBUG( cudaStreamSyncToken, "no cuda stream sync token provided" )
        stream = cudaStreamSyncToken->getCUDAStream();
    }

    cublasSetKernelStream( stream );
    LAMA_CHECK_CUBLAS_ERROR

    cublasDscal( n, alpha, x_d, incx );

    // No error check here possible as kernel is started asynchronously

    if ( !syncToken )
    {
        cudaStreamSynchronize( 0 );
        LAMA_CHECK_CUDA_ERROR
    }
}
Code example #13
File: cuda.c Project: anojavan/csinparallel
/* 
 * Advance the simulation by <n> generations by mapping the OpenGL pixel buffer
 * objects for writing from CUDA, executing the kernel <n> times, and unmapping
 * the pixel buffer object.
 */
void advance_generations(unsigned long n)
{
	uint8_t* device_bufs[2];
	size_t size;

	DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n");
	cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size, 
								cuda_graphics_resources[0]);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size, 
								cuda_graphics_resources[1]);

	check_cuda_error();

	while (n--) {

		DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n",
				grid.width, grid.height);

		launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf], 
									grid.width, grid.height);

		grid.which_buf ^= 1;
	}

	DEBUG2("Unmapping CUDA resources\n");

	cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0);
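	/* block until the unmap and all work previously queued on the default stream have completed */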
	cudaStreamSynchronize(0);
}
Code example #14
File: parallel.cpp Project: 20337112/caffe
void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
    solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}
Code example #15
File: controllers.cpp Project: CUHK-MMLAB/gkmeans
void KMeansController<Dtype>::PostProcess() {
    /** Use the computed cluster centers to assign cluster labels **/

    int* label_data = this->int_outputs_[0]->mutable_cpu_data();

    // first restart the data provider.
    this->data_providers_[0]->ForceRestart();

    // assign cluster labels
    bool assignment_finished = false;
    while (!assignment_finished) {
        size_t batch_num = 0;
        size_t index = this->data_providers_[0]->current_index();

        // getting current index must precede GetData()
        // otherwise the current index in the data provider would be advanced
        this->mats_[0] = this->data_providers_[0]->GetData(batch_num);
        if ( this->data_providers_[0]->current_index()  == 0) {
            assignment_finished = true;
        }
        //execute only the maximization functions for all samples
        this->function_input_vecs_[0][0] = this->mats_[0].get();
        this->funcs_[0]->Execute(this->function_input_vecs_[0], this->function_output_vecs_[0], GKMeans::stream(0));
        CUDA_CHECK(cudaStreamSynchronize(GKMeans::stream(0)));

        // copy out cluster labels
        int* out_data = (int*)this->mats_[2]->cpu_data();
        for (size_t i = 0; i < batch_num; i++) {
            label_data[i + index] = out_data[i];
        }
    }

    // copy out center data
    memcpy(this->numeric_outputs_[0]->mutable_cpu_data(), this->mats_[1]->cpu_data(), this->mats_[1]->count() * sizeof(Dtype));
}
Code example #16
File: load_kernels.cpp Project: BerndDoser/Bonsai
void octree::resetCompact()
{
  // reset counts to 1 so next compact proceeds...
  cudaStreamSynchronize(execStream->s()); // must make sure any outstanding compact is finished  
  this->devMemCountsx[0] = 1;
  this->devMemCountsx.h2d(1, false, copyStream->s());
}
Code example #17
File: parallel.cpp Project: bbshocking/caffe
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    timer->Start();
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  // Send gradients to parent
  if (parent_) {
    timer->Start();
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

#ifdef DEBUG
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
    *timing << " send_grad: " << timer->MilliSeconds();
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}
Code example #18
TEST(MemcpyAsync, CheckReturnValues) {
    /**
     * The API documentation states that
     * cudaErrorInvalidDevicePointer is a valid return value for
     * cudaMemcpyAsync
     *
     * TODO:  This needs a test.
     */
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /**
     * Test woefully out of range directions.
     */
    int a = 0;
    ret = cudaMemcpyAsync(&a,   &a,   sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaMemcpyAsync(NULL, NULL, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
Code example #19
/**
 * CUDA4 introduced the cudaMemcpyDefault direction to cudaMemcpy.
 */
TEST(MemcpyAsync, CheckDefaultDirection) {
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    int a1 = 0;
    int a2 = 0;
    int * b;
    ret = cudaMalloc((void**) &b, sizeof(*b));
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, &a2, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(&a1, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, &a1, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(b, b, sizeof(a1), cudaMemcpyDefault, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaFree(b);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
Code example #20
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
Code example #21
TEST_P(MemcpyAsync, D2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *d2;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMalloc(&d2, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d2, d1, alloc, cudaMemcpyDeviceToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d2);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
Code example #22
File: base_data_provider.cpp Project: caomw/gkmeans
  shared_ptr<Mat<Dtype>> DataProviderBase<Dtype>::GetData(size_t &num){

    //first join the async task
    num = num_future_.get();

    //also sync the cuda stream
    CUDA_CHECK(cudaStreamSynchronize(data_stream_));

    // get the mat with data filled
    int ready_idx = data_q_.front();
    shared_ptr<Mat<Dtype>> ready_mat = data_slot_vec_[ready_idx];

    // send the id of the mat to the end of the queue
    data_q_.pop_front(); data_q_.push_back(ready_idx);

    // get the mat to be filled (id at the front of the queue)
    Mat<Dtype>* working_mat = data_slot_vec_[data_q_.front()].get();
    // kick off the async task (force async launch)
    num_future_ = std::async(std::launch::async, &DataProviderBase::AsyncFunc, this, working_mat);

    // increment the current data cursor.
    current_index_ += num;
    if (current_index_ == round_size_){
      current_index_ = 0;
    }

    return ready_mat;
  }
Code example #23
File: base_data_layer.cpp Project: AndrewChiyz/hed
void BasePrefetchingLabelmapDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      LabelmapBatch<Dtype>* batch = prefetch_free_.pop();
      load_batch(batch);
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }
#endif
      prefetch_full_.push(batch);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));
  }
#endif
}
Code example #24
File: SentinelWalker.hpp Project: drabiega/cuda-wos
void
SentinelWalker<DistanceEvaluator, ResultsCalculator>::walk
(size_t gridSize, size_t blockSize, size_t minWalks,
 DistanceEvaluator& distance, ResultsCalculator& results)
{
  size_t threads = gridSize * blockSize;
  size_t walksPerThread = minWalks / threads + 1;
  
  srand(time(NULL));

  ResultsCalculator* d_results;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_results, sizeof(ResultsCalculator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_results, &results, sizeof(ResultsCalculator),
			 cudaMemcpyHostToDevice));
  
  DistanceEvaluator* d_distance;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_distance, sizeof(DistanceEvaluator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_distance, &distance, sizeof(DistanceEvaluator),
			 cudaMemcpyHostToDevice));

  cuda::check(__FILE__, __LINE__, cudaDeviceSynchronize());

  cudaStream_t workStream;
  cuda::check(cudaStreamCreateWithFlags(&workStream, cudaStreamNonBlocking));

  cudaStream_t sentinelStream;
  cuda::check(cudaStreamCreateWithFlags(&sentinelStream, cudaStreamNonBlocking));
  
  double start = seconds();
  
  gpuWalkWithSentinel<DistanceEvaluator><<<gridSize, blockSize, 0, workStream>>>
    (rand(), d_distance, d_results);

  size_t runningTotal;
  do {
    runningTotal = results.getRunningTotal(sentinelStream);
    //std::cout << runningTotal << std::endl;
  } while (runningTotal < minWalks);

  std::cout << "Stopping." << std::endl;
  
  results.endRun(sentinelStream);

  std::cout << "Stop signal sent." << std::endl;
  
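  // wait for the walker kernel running on workStream to drain before measuring elapsed time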
  cuda::check(__FILE__, __LINE__, cudaStreamSynchronize(workStream));
  
  double timeSpent      = seconds() - start;
  double totalWalks     = walksPerThread * threads;
  double timePerWalk    = timeSpent / totalWalks;
  double walksPerSecond = totalWalks / timeSpent;
  
  std::cout << "Total GPU Time:   " << timeSpent      << "s"<< std::endl;
  std::cout << "Time Per Walk:    " << timePerWalk    << "s" << std::endl;
  std::cout << "Walks Per Second: " << walksPerSecond << "s" << std::endl;
}
Code example #25
void THCStorage_(set)(THCState *state, THCStorage *self, ptrdiff_t index, scalar_t value)
{
  THArgCheck((index >= 0) && (index < self->numel()), 2, "index out of bounds");
  cudaStream_t stream = THCState_getCurrentStream(state);
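  // copy the scalar to device memory asynchronously on the current stream, then block until the transfer completes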
  THCudaCheck(cudaMemcpyAsync(THCStorage_(data)(state, self) + index, &value, sizeof(scalar_t),
                              cudaMemcpyHostToDevice,
                              stream));
  THCudaCheck(cudaStreamSynchronize(stream));
}
Code example #26
bool ControlCubeCache::_readElement(NodeLinkedList<index_node_t> * element)
{
	#ifndef NDEBUG
	if ((int)element->element > _maxNumCubes) 
	{
		std::cerr<<"Control Cube CPU Cache, try to write outside reserved memory"<<std::endl;
		throw;
	}
	#endif
	index_node_t idCube = element->id;
	float * cube = (_memory + element->element*_sizeElement);

	if (!checkCubeInside(element->id))
	{
		if (cudaSuccess != cudaMemset((void*)cube, 0, _sizeElement*sizeof(float)))
		{
			std::cout<<"---> "<<idCube<<" "<<_minValue<<" "<<_maxValue<<std::endl;
			LBERROR<<"Control Cube Cache: error copying to a device: "<<cudaGetErrorString(cudaGetLastError()) <<" "<<cube<<" "<<_sizeElement<<std::endl;
			throw;
		}
		return true;
	}

	index_node_t idCubeCPU = idCube >> 3*(_levelCube - _cpuCache->getCubeLevel()); 
	float * pCube = _cpuCache->getAndBlockElement(idCubeCPU);

	if (pCube != 0)
	{
		vmml::vector<3, int> coord = getMinBoxIndex2(idCube, _levelCube, _nLevels);
		vmml::vector<3, int> coordC = getMinBoxIndex2(idCubeCPU, _cpuCache->getCubeLevel(), _nLevels);
		coord -= coordC;
		vmml::vector<3, int> realDimCPU = _cpuCache->getRealCubeDim();

		cudaMemcpy3DParms myParms = {0};
		myParms.srcPtr = make_cudaPitchedPtr((void*)pCube, realDimCPU.z()*sizeof(float), realDimCPU.x(), realDimCPU.y()); 
		//myParms.dstPtr = make_cudaPitchedPtr((void*)cube, _realcubeDim.z()*sizeof(float), _realcubeDim.x(), _realcubeDim.y()); 
		myParms.dstPtr = make_cudaPitchedPtr((void*)cube, _dimCube*sizeof(float), _dimCube, _dimCube); 
		myParms.extent = make_cudaExtent(_dimCube*sizeof(float), _dimCube, _dimCube);
		myParms.dstPos = make_cudaPos(0,0,0);
		myParms.srcPos = make_cudaPos(coord.z()*sizeof(float), coord.y(), coord.x());
		myParms.kind = cudaMemcpyHostToDevice;

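		// launch the 3D copy on _stream and wait for it; a failure of either call is treated as fatal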
		if (cudaSuccess != cudaMemcpy3DAsync(&myParms, _stream) || cudaSuccess != cudaStreamSynchronize(_stream))
		{
			std::cout<<"---> "<<idCube<<" "<<_minValue<<" "<<_maxValue<<std::endl;
			LBERROR<<"Control Cube Cache: error copying to a device: "<<cudaGetErrorString(cudaGetLastError()) <<" "<<cube<<" "<<pCube<<" "<<_sizeElement<<std::endl;
			throw;
		}

		_cpuCache->unlockElement(idCubeCPU);
		return true;
	}
	else
	{
		return false;
	}
}
Code example #27
TEST(MemcpyAsync, Pinned) {
    /**
     * Host memory must be pinned in order to be used as an argument to
     * cudaMemcpyAsync.  Panoptes only prints a warning about this error
     * rather than actually returning an error via the CUDA API.  This test
     * checks for the absence of an error, so it will catch the change once the CUDA
     * implementation starts returning one for nonpinned host memory.
     */
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    const size_t pages = 3;
    assert(pages > 0);

    cudaError_t ret;
    cudaStream_t stream;

    uint8_t *device_ptr, *host_ptr;
    ret = cudaMalloc((void **) &device_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocHost((void **) &host_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Page aligned transfers */
    for (size_t i = 0; i < pages; i++) {
        for (size_t j = i; j < pages; j++) {
            ret = cudaMemcpyAsync(device_ptr, host_ptr + i * page_size,
                (pages - j) * page_size, cudaMemcpyHostToDevice, stream);
            EXPECT_EQ(cudaSuccess, ret);

            ret = cudaMemcpyAsync(host_ptr + i * page_size, device_ptr,
                (pages - j) * page_size, cudaMemcpyDeviceToHost, stream);
            EXPECT_EQ(cudaSuccess, ret);
        }
    }

    /* Try a nonaligned transfer. */
    ret = cudaMemcpyAsync(device_ptr, host_ptr + (page_size / 2),
        page_size / 2, cudaMemcpyHostToDevice, stream);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
Code example #28
void BasePrefetchingDataLayer<Ftype, Btype>::InternalThreadEntryN(size_t thread_id) {
#ifndef CPU_ONLY
  const bool use_gpu_transform = this->is_gpu_transform();
#endif
  static thread_local bool iter0 = this->phase_ == TRAIN;
  if (iter0 && this->net_inititialized_flag_ != nullptr) {
    this->net_inititialized_flag_->wait();
  } else {  // nothing to wait -> initialize and start pumping
    std::lock_guard<std::mutex> lock(mutex_in_);
    InitializePrefetch();
    start_reading();
    iter0 = false;
  }
  try {
    while (!must_stop(thread_id)) {
      const size_t qid = this->queue_id(thread_id);
#ifndef CPU_ONLY
      shared_ptr<Batch<Ftype>> batch = prefetches_free_[qid]->pop();

      CHECK_EQ((size_t) -1, batch->id());
      load_batch(batch.get(), thread_id, qid);
      if (Caffe::mode() == Caffe::GPU) {
        if (!use_gpu_transform) {
          batch->data_.async_gpu_push();
        }
        if (this->output_labels_) {
          batch->label_.async_gpu_push();
        }
        CUDA_CHECK(cudaStreamSynchronize(Caffe::th_stream_aux(Caffe::STREAM_ID_ASYNC_PUSH)));
      }

      prefetches_full_[qid]->push(batch);
#else
      shared_ptr<Batch<Ftype>> batch = prefetches_free_[qid]->pop();
      load_batch(batch.get(), thread_id, qid);
      prefetches_full_[qid]->push(batch);
#endif

      if (iter0) {
        if (this->net_iteration0_flag_ != nullptr) {
          this->net_iteration0_flag_->wait();
        }
        std::lock_guard<std::mutex> lock(mutex_out_);
        if (this->net_inititialized_flag_ != nullptr) {
          this->net_inititialized_flag_ = nullptr;  // no wait on the second round
          InitializePrefetch();
          start_reading();
        }
        if (this->auto_mode_) {
          break;
        }  // manual otherwise, thus keep rolling
        iter0 = false;
      }
    }
  } catch (boost::thread_interrupted&) {
  }
}
Code example #29
File: device.cpp Project: AI42/minerva
void GpuDevice::DoExecute(const DataList& in, const DataList& out, PhysicalOp& op, int thrid) {
  Context ctx;
  ctx.impl_type = ImplType::kCuda;
  ctx.stream = impl_->stream[thrid];
  ctx.cublas_handle = impl_->cublas_handle[thrid];
  ctx.cudnn_handle = impl_->cudnn_handle[thrid];
  op.compute_fn->Execute(in, out, ctx);
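  // the op's kernels are launched on this thread's stream; block until they finish so results are ready when DoExecute returns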
  CUDA_CALL_MSG(op.compute_fn->Name(), cudaStreamSynchronize(impl_->stream[thrid]));
}
Code example #30
File: HistogramWalker.hpp Project: drabiega/cuda-wos
void
HistogramWalker<DistanceEvaluator, ResultsCalculator>::walk
(size_t gridSize, size_t blockSize, size_t minWalks,
 DistanceEvaluator& distance, ResultsCalculator& results)
{
#ifdef PORTION
  std::cout << "Portion Walk Sizes:" << std::endl;
#endif
#ifdef EARLYSENTINEL
  std::cout << "Early Sentinel Walk Sizes:" << std::endl;
#endif
#ifdef WAITSENTINEL
  std::cout << "Wait Sentinel Walk Sizes:" << std::endl;
#endif

  size_t threads = gridSize * blockSize;
  size_t walksPerThread = minWalks / threads + 1;
  
  srand(time(NULL));

  ResultsCalculator* d_results;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_results, sizeof(ResultsCalculator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_results, &results, sizeof(ResultsCalculator),
			 cudaMemcpyHostToDevice));
  
  DistanceEvaluator* d_distance;
  cuda::check(__FILE__, __LINE__,
	      cudaMalloc((void**)&d_distance, sizeof(DistanceEvaluator)));
  cuda::check(__FILE__, __LINE__,
	      cudaMemcpy(d_distance, &distance, sizeof(DistanceEvaluator),
			 cudaMemcpyHostToDevice));

  cuda::check(__FILE__, __LINE__, cudaDeviceSynchronize());

  cudaStream_t workStream;
  cuda::check(cudaStreamCreateWithFlags(&workStream, cudaStreamNonBlocking));

  cudaStream_t sentinelStream;
  cuda::check(cudaStreamCreateWithFlags(&sentinelStream, cudaStreamNonBlocking));
  
  double start = seconds();
  
  gpuWalkWithSentinel<DistanceEvaluator><<<gridSize, blockSize, 0, workStream>>>
    (walksPerThread, rand(), d_distance, d_results);

  size_t runningTotal;
  do {
    runningTotal = results.getRunningTotal(sentinelStream);
  } while (runningTotal < minWalks);
  
  results.endRun(sentinelStream);

  cuda::check(__FILE__, __LINE__, cudaStreamSynchronize(workStream));
}