Example #1
/*
 * In some cases we need an implementation of memmove.  This is not fast, but
 * it is not often needed.
 */
void *opal_cuda_memmove(void *dest, void *src, size_t size)
{
    CUdeviceptr tmp;
    int res;

    res = cuMemAlloc(&tmp, size);
    if (res != CUDA_SUCCESS) {
        opal_output(0, "CUDA: memmove-Error in cuMemAlloc: res=%d, size=%d",
                    res, (int)size);
        abort();
    }
    res = cuMemcpy(tmp, (CUdeviceptr) src, size);
    if (res != CUDA_SUCCESS) {
        opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                    res, (void *)tmp, src, (int)size);
        abort();
    }
    res = cuMemcpy((CUdeviceptr) dest, tmp, size);
    if (res != CUDA_SUCCESS) {
        opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                    res, dest, (void *)tmp, (int)size);
        abort();
    }
    cuMemFree(tmp);
    return dest;
}
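cuMemcpy makes no guarantee about overlapping source and destination ranges, which is why the wrapper above stages the copy through a freshly allocated temporary buffer. The following is a minimal, self-contained sketch of the same pattern (my own illustration, not Open MPI code) that hands the driver error back to the caller instead of aborting:

/* Illustrative sketch, not Open MPI code: the same stage-through-a-temporary
 * pattern written as a standalone helper that returns the driver error to the
 * caller instead of aborting. */
#include <cuda.h>
#include <stddef.h>

static CUresult device_memmove(CUdeviceptr dest, CUdeviceptr src, size_t size)
{
    CUdeviceptr tmp = 0;
    CUresult res;

    res = cuMemAlloc(&tmp, size);
    if (res != CUDA_SUCCESS)
        return res;

    /* src -> tmp -> dest: the intermediate buffer makes overlapping ranges safe. */
    res = cuMemcpy(tmp, src, size);
    if (res == CUDA_SUCCESS)
        res = cuMemcpy(dest, tmp, size);

    cuMemFree(tmp);
    return res;
}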
Example #2
/*
 * We need an intermediate cuMemcpy function so we can check the return code
 * of the call.  If we see an error, abort, as there is no recovery at this
 * point.
 */
void *opal_cuda_memcpy(void *dest, void *src, size_t size)
{
    int res;
    res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size);
    if (res != CUDA_SUCCESS) {
        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d",
                    res, dest, src, (int)size);
        abort();
    } else {
        return dest;
    }
}
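A common alternative to a wrapper function like opal_cuda_memcpy is a check macro that prints the driver's error string and aborts; the CCHECK macro used in a later example is in the same spirit. A minimal sketch, with the macro name CU_CHECK and the message format being assumptions for illustration:

/* Sketch of an abort-on-error check macro for CUDA driver API calls.
 * The name CU_CHECK and the message format are assumptions, not taken from
 * any of the projects quoted on this page. */
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

#define CU_CHECK(call)                                                      \
    do {                                                                    \
        CUresult err_ = (call);                                             \
        if (err_ != CUDA_SUCCESS) {                                         \
            const char *msg_ = NULL;                                        \
            cuGetErrorString(err_, &msg_);                                  \
            fprintf(stderr, "%s failed: %s (%d)\n", #call,                  \
                    msg_ ? msg_ : "unknown error", (int)err_);              \
            abort();                                                        \
        }                                                                   \
    } while (0)

/* Usage: CU_CHECK(cuMemcpy(dest, src, size)); */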
Example #3
SEXP R_auto_cuMemcpy(SEXP r_dst, SEXP r_src, SEXP r_ByteCount)
{
    SEXP r_ans = R_NilValue;
    CUdeviceptr dst = REAL(r_dst)[0];
    CUdeviceptr src = REAL(r_src)[0];
    size_t ByteCount = REAL(r_ByteCount)[0];
    
    CUresult ans;
    ans = cuMemcpy(dst, src, ByteCount);
    
    r_ans = Renum_convert_CUresult(ans);
    
    return(r_ans);
}
Example #4
sortStatus_t sortArrayFromList(sortEngine_t engine, sortData_t data,
	SortTable table) {

	int bit = data->firstBit;

	// Clear the restore size counter at the start of the sort. If this is 
	// non-zero at the end of the sort (even if the sort failed), copy from the
	// restore buffer to the buffer that was at keys[0] when the sorting began.
	engine->restoreSourceSize = 0;
	CUdeviceptr firstKeysBuffer = data->keys[0];
	sortStatus_t status = SORT_STATUS_SUCCESS;

	// Loop through each element of the pass table.
	bool firstPass = true;
	for(int i(0); i < 6; ++i)
		for(int j(0); j < table.pass[i]; ++j) {
			int endBit = bit + i + 1;

			IntPair sortCode = GetSortCode(bit, endBit, data, firstPass);
			int earlyExit;
			status = sortPass(engine, data, table.numSortThreads, 
				table.valuesPerThread, table.useTransList, bit, endBit, 
				sortCode.first, sortCode.second, &earlyExit, data->parity);
			// On failure, exit both loops; the restore below still runs.
			if(SORT_STATUS_SUCCESS != status) { i = 6; break; }
			
			bit = endBit;
			if(3 == earlyExit) { i = 6; break; }
			if(2 == earlyExit) continue;
			firstPass = false;
		}

	// Restore the trailing keys.
	if(engine->restoreSourceSize) {
		CUdeviceptr target = AdjustPointer<uint>(firstKeysBuffer,
			data->numElements);
		CUresult result = cuMemcpy(target, engine->keyRestoreBuffer->Handle(), 
			4 * engine->restoreSourceSize);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}
	
	return status;
}
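AdjustPointer<uint> is a helper from the sort library that is not shown in these snippets; judging from its use, it advances a raw device pointer by a count of elements of the given type, so target points just past the last real key. A plausible one-liner sketch, which is an assumption rather than the library's actual definition:

// Assumed shape of the AdjustPointer helper used above: advance a raw device
// pointer by numElements elements of type T (CUdeviceptr is an integer type,
// so the arithmetic is in bytes).
#include <cuda.h>
#include <cstddef>

template<typename T>
CUdeviceptr AdjustPointer(CUdeviceptr ptr, size_t numElements) {
    return ptr + numElements * sizeof(T);
}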
Example #5
//------------------------------------------------------------------------------
void mat_mul_test(const std::vector< std::string >& file_paths,
                  const char* kernel_name,
                  int size, 
                  int grid_dim_x,  int grid_dim_y,  int grid_dim_z,
                  int block_dim_x, int block_dim_y, int block_dim_z) {
    const int MATRIX_WIDTH = size; 
    const int MATRIX_HEIGHT = MATRIX_WIDTH; 
    const int VECTOR_SIZE = MATRIX_WIDTH;
    const int MATRIX_SIZE = MATRIX_WIDTH * MATRIX_HEIGHT;
    const int MATRIX_BYTE_SIZE = sizeof(real_t) * MATRIX_SIZE;
    const int VECTOR_BYTE_SIZE = sizeof(real_t) * VECTOR_SIZE;
   
    CCHECK(cuInit(0));
    array_t in_matrix_h(MATRIX_SIZE,  real_t(1));
    array_t in_vector_h(VECTOR_SIZE,  real_t(1));
    array_t out_vector_h(VECTOR_SIZE, real_t(0));
    CUdeviceptr in_matrix_d = 0;
    CUdeviceptr in_vector_d = 0;
    CUdeviceptr out_vector_d = 0;
    CUdevice device = CUdevice();
    CUcontext ctx = CUcontext();
    // Obtain the device handle explicitly rather than relying on the
    // zero-initialized CUdevice happening to name device 0.
    CCHECK(cuDeviceGet(&device, 0));
    CCHECK(cuCtxCreate(&ctx, 0, device));
    CCHECK(cuMemAlloc(&in_matrix_d, MATRIX_BYTE_SIZE));
    assert(in_matrix_d);
    CCHECK(cuMemAlloc(&in_vector_d, VECTOR_BYTE_SIZE));
    assert(in_vector_d);
    CCHECK(cuMemAlloc(&out_vector_d, VECTOR_BYTE_SIZE));
    assert(out_vector_d);
    // cuMemcpy accepts the raw host pointers below because 64-bit builds use
    // unified virtual addressing; otherwise cuMemcpyHtoD/cuMemcpyDtoH would be
    // required for the host<->device transfers.
    CCHECK(cuMemcpy(in_matrix_d,  CUdeviceptr(&in_matrix_h[0]),
           MATRIX_BYTE_SIZE));
    CCHECK(cuMemcpy(in_vector_d,  CUdeviceptr(&in_vector_h[0]),
           VECTOR_BYTE_SIZE));
    CCHECK(cuMemcpy(out_vector_d, CUdeviceptr(&out_vector_h[0]),
           VECTOR_BYTE_SIZE));
    CUmodule module = CUmodule();
    CUfunction fun = CUfunction(); 
   
    build(module, fun, file_paths, kernel_name);
       
    void* kernel_params[] = {&in_matrix_d,
                             (void *)(&MATRIX_WIDTH),
                             (void *)(&MATRIX_HEIGHT),
                             &in_vector_d,
                             &out_vector_d};
    CCHECK(cuLaunchKernel(fun, 
                    grid_dim_x, grid_dim_y, grid_dim_z,
                    block_dim_x, block_dim_y, block_dim_z,
                    0, // shared_mem_bytes
                    0, // stream
                    kernel_params,
                    0));
    
    CCHECK(cuMemcpy(CUdeviceptr(&out_vector_h[0]), out_vector_d,
                    VECTOR_BYTE_SIZE));
      
    // print first two and last elements
    std::cout << "vector[0]    = " << out_vector_h[ 0 ] << '\n';
    std::cout << "vector[1]    = " << out_vector_h[ 1 ] << '\n';
    std::cout << "vector[last] = " << out_vector_h.back() << std::endl;

    CCHECK(cuMemFree(in_matrix_d));
    CCHECK(cuMemFree(in_vector_d));
    CCHECK(cuMemFree(out_vector_d));
    CCHECK(cuModuleUnload(module));
    CCHECK(cuCtxDestroy(ctx));
}
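The CCHECK macro and the build() helper are defined elsewhere in that project and are not shown here. Below is a hypothetical sketch of what build() might do, assuming file_paths names a single PTX or cubin file that cuModuleLoad can consume; real code might instead JIT several sources with cuLinkCreate/cuLinkAddFile, which this sketch does not attempt:

// Hypothetical build() sketch (an assumption, not the example's real code):
// load the first module file from disk and resolve the kernel by name.
#include <cuda.h>
#include <stdexcept>
#include <string>
#include <vector>

void build(CUmodule& module, CUfunction& fun,
           const std::vector<std::string>& file_paths,
           const char* kernel_name) {
    if (file_paths.empty())
        throw std::runtime_error("no module files given");
    // Assumption: the first path is a PTX or cubin file.
    if (cuModuleLoad(&module, file_paths[0].c_str()) != CUDA_SUCCESS)
        throw std::runtime_error("cuModuleLoad failed for " + file_paths[0]);
    if (cuModuleGetFunction(&fun, module, kernel_name) != CUDA_SUCCESS)
        throw std::runtime_error("cuModuleGetFunction failed");
}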
Example #6
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
	    int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device. HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
		     " gangs=%u, workers=%u, vectors=%u\n",
		     __FUNCTION__, targ_fn->launch->fn,
		     dims[0], dims[1], dims[2]);

  // OpenACC		CUDA
  //
  // num_gangs		nctaid.x
  // num_workers	ntid.y
  // vector length	ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
		      dims[GOMP_DIM_GANG], 1, 1,
		      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
		      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
	GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
			   maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *)dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
		       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
		     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
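The core of this launcher is the argument marshalling: the device addresses of all mapped objects are packed into one array, that array is copied to the device, and the kernel receives a single pointer to it (kargs[0] = &dp). The sketch below shows the same pattern in isolation; the function name, the cuMemcpyHtoD call, and the synchronous cleanup are my own simplifications, whereas the plugin keeps its mapped page alive through its map_push/map_pop machinery:

// Illustrative sketch (not libgomp code): stage an array of device pointers on
// the host, copy it to the device, and pass its device address as the single
// kernel parameter, as nvptx_exec does above.
#include <cuda.h>
#include <stdint.h>
#include <vector>

CUresult launch_with_pointer_array(CUfunction fn, CUstream stream,
                                   const std::vector<CUdeviceptr>& bufs,
                                   unsigned gangs, unsigned workers,
                                   unsigned vectors)
{
    // Host-side array of the device pointers the kernel should see.
    std::vector<void *> host_args(bufs.size());
    for (size_t i = 0; i < bufs.size(); ++i)
        host_args[i] = (void *)(uintptr_t)bufs[i];

    // Device-side copy of that array.
    CUdeviceptr dp = 0;
    CUresult r = cuMemAlloc(&dp, bufs.size() * sizeof(void *));
    if (r != CUDA_SUCCESS) return r;
    r = cuMemcpyHtoD(dp, host_args.data(), bufs.size() * sizeof(void *));
    if (r != CUDA_SUCCESS) { cuMemFree(dp); return r; }

    // One kernel argument: the address of dp, so the kernel sees a void**.
    void *kargs[1] = { &dp };
    r = cuLaunchKernel(fn, gangs, 1, 1, vectors, workers, 1,
                       0 /* shared memory */, stream, kargs, NULL);

    // Simplification for the sketch: wait before freeing the pointer array.
    if (r == CUDA_SUCCESS) r = cuStreamSynchronize(stream);
    cuMemFree(dp);
    return r;
}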
Example #7
sortStatus_t sortPass(sortEngine_t engine, sortData_t data, int numSortThreads, 
	int valuesPerThread, bool useTransList, int firstBit, int endBit, 
	int endKeyFlags, int valueCode, int* earlyExitCode, int& parity) {

	if(data->numElements > data->maxElements) return SORT_STATUS_INVALID_VALUE;

	if((firstBit < 0) || (endBit > 32) || (endBit <= firstBit) || 
		((endBit - firstBit) > 6))
		return SORT_STATUS_INVALID_VALUE;
	
	int numBits = endBit - firstBit;

	SortTerms terms = ComputeSortTerms(numSortThreads, valuesPerThread,
		useTransList, numBits, data->numElements, engine->numSMs);

	sortEngine_d::HistKernel* hist;
	sortEngine_d::SortKernel* sort;
	CUresult result;
	sortStatus_t status = LoadKernels(engine, numSortThreads, valuesPerThread,
		useTransList, valueCode, &hist, &sort);
	if(SORT_STATUS_SUCCESS != status) return status;

	status = AllocSortResources(terms, engine);
	if(SORT_STATUS_SUCCESS != status) return status;
	
	// Rebuild the rangePairs table if numHistRows differs from the value used
	// on the previous pass.
	if(terms.numHistRows != engine->lastNumHistRowsProcessed) {
		int2* pairs = &engine->rangePairsHost[0];
		int numPairs = terms.numHistBlocks * NumHistWarps;
		int pairCount = terms.numHistRows / numPairs;
		int pairSplit = terms.numHistRows % numPairs;
		pairs[0].x = 0;
		for(int i = 0; i < numPairs; ++i) {
			if(i) pairs[i].x = pairs[i - 1].y;
			pairs[i].y = pairs[i].x + pairCount + (i < pairSplit);
		}

		// Copy rangePairsHost to device memory.
		CUresult result = engine->rangePairs->FromHost(
			&engine->rangePairsHost[0], numPairs);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

		engine->lastNumHistRowsProcessed = terms.numHistRows;
	}

	// Save the trailing keys
	if((SORT_END_KEY_SAVE & endKeyFlags) && terms.numEndKeys) {
		engine->restoreSourceSize = terms.numEndKeys;
		CUdeviceptr source = AdjustPointer<uint>(data->keys[0],
			data->numElements);
		CUresult result = cuMemcpy(engine->keyRestoreBuffer->Handle(), source, 
			4 * engine->restoreSourceSize);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}

	// Set the trailing keys to all set bits so the padded elements sort to the
	// end of the array (they were backed up above when SORT_END_KEY_SAVE was
	// requested).
	if((SORT_END_KEY_SET & endKeyFlags) && terms.numEndKeys) {
		CUdeviceptr target = AdjustPointer<uint>(data->keys[0],
			data->numElements);
		CUresult result = cuMemsetD32(target, 0xffffffff, terms.numEndKeys);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
	}

	// Run the count kernel
	if(data->earlyExit) engine->sortDetectCounters->Fill(0);

	CuCallStack callStack;
	callStack.Push(data->keys[0], firstBit, data->numElements, 
		terms.countValuesPerThread, engine->countBuffer);
	CuFunction* count = data->earlyExit ? 
		engine->count->eeFunctions[numBits - 1].get() :
		engine->count->functions[numBits - 1].get();
	result = count->Launch(terms.numCountBlocks, 1, callStack);
	if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

	*earlyExitCode = 0;
	if(data->earlyExit) {
		uint4 detect;
		result = engine->sortDetectCounters->ToHost(&detect, 1);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

		uint radixSort = detect.x;
		uint fullCount = detect.y;
		uint radixCount = detect.z;

		if(terms.numCountBlocks == (int)fullCount) *earlyExitCode = 3;
		else if(terms.numCountBlocks == (int)radixCount) *earlyExitCode = 2;

		// If more than 5% of the sort blocks are already sorted, use the
		// slightly slower early-exit sort kernel.
		else if((double)radixSort / terms.numSortBlocks > 0.05)
			*earlyExitCode = 1;
		else *earlyExitCode = 0;
	}

	if(*earlyExitCode <= 1) {

		// Run the three histogram kernels
		callStack.Reset();
		callStack.Push(engine->countBuffer, engine->rangePairs, 
			engine->countScan, engine->columnScan);
		result = hist->pass1[numBits - 1]->Launch(terms.numHistBlocks, 1,
			callStack);
		if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

		callStack.Reset();
		callStack.Push(terms.numHistBlocks, engine->countScan);
		result = hist->pass2[numBits - 1]->Launch(1, 1, callStack);
		if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

		callStack.Reset();
		callStack.Push(engine->countBuffer, engine->rangePairs, 
			engine->countScan, engine->columnScan, engine->bucketCodes,
			*earlyExitCode);
		result = hist->pass3[numBits - 1]->Launch(terms.numHistBlocks, 1,
			callStack);
		if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;


		// Run the sort kernel.
		// Because the max grid size is only 65535 in any dimension, large
		// sorts require multiple kernel launches.
		int MaxGridSize = 65535;
		int numSortLaunches = DivUp(terms.numSortBlocks, MaxGridSize);

		for(int launch(0); launch < numSortLaunches; ++launch) {
			int block = MaxGridSize * launch;
			int numBlocks = std::min(MaxGridSize, terms.numSortBlocks - block);

			callStack.Reset();
			callStack.Push(data->keys[0], block, engine->bucketCodes, firstBit, 
				data->keys[1]);
			
			switch(valueCode) {
				case 1:		// VALUE_TYPE_INDEX
					callStack.Push(data->values1[1]); 
					break;
				case 2:		// VALUE_TYPE_SINGLE
					callStack.Push(data->values1[0], data->values1[1]);
					break;
				case 3:		// VALUE_TYPE_MULTI
					callStack.Push(data->valueCount,
						// Six values_global_in
						data->values1[0], data->values2[0], data->values3[0],
						data->values4[0], data->values5[0], data->values6[0],
						
						// Six values_global_out
						data->values1[1], data->values2[1], data->values3[1],
						data->values4[1], data->values5[1], data->values6[1]);
					break;
			}

			CuFunction* sortFunc = *earlyExitCode ? 
				sort->eeFunctions[numBits - 1].get() :
				sort->functions[numBits - 1].get();

			result = sortFunc->Launch(numBlocks, 1, callStack);
			if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;
		}

		// Swap the source and target buffers in the data structure.
		std::swap(data->keys[0], data->keys[1]);
		std::swap(data->values1[0], data->values1[1]);
		std::swap(data->values2[0], data->values2[1]);
		std::swap(data->values3[0], data->values3[1]);
		std::swap(data->values4[0], data->values4[1]);
		std::swap(data->values5[0], data->values5[1]);
		std::swap(data->values6[0], data->values6[1]);
		parity ^= 1;
	}

	return SORT_STATUS_SUCCESS;
}
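The trailing-key logic above pads the tail of a partially filled last block with 0xffffffff so those slots sort to the very end, after backing up the original values for sortArrayFromList to restore. A compact sketch of just the padding step, with the rounding granularity as an assumed parameter rather than a value taken from the engine:

// Sketch, not part of the sort engine: pad the tail of a 32-bit key buffer
// with all-set bits up to an assumed block granularity, mirroring the
// SORT_END_KEY_SET branch of sortPass.
#include <cuda.h>
#include <stddef.h>

CUresult pad_trailing_keys(CUdeviceptr keys, size_t numElements,
                           size_t elementsPerBlock)
{
    size_t rounded = (numElements + elementsPerBlock - 1) /
        elementsPerBlock * elementsPerBlock;
    size_t numEndKeys = rounded - numElements;
    if (!numEndKeys) return CUDA_SUCCESS;

    // Keys are 4 bytes wide, so the tail starts 4 * numElements bytes in.
    CUdeviceptr tail = keys + 4 * numElements;
    return cuMemsetD32(tail, 0xffffffff, numEndKeys);
}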