/* * In some cases, need an implementation of memmove. This is not fast, but * it is not often needed. */ void *opal_cuda_memmove(void *dest, void *src, size_t size) { CUdeviceptr tmp; int res; res = cuMemAlloc(&tmp,size); res = cuMemcpy(tmp, (CUdeviceptr) src, size); if(res != CUDA_SUCCESS){ opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, (void *)tmp, src, (int)size); abort(); } res = cuMemcpy((CUdeviceptr) dest, tmp, size); if(res != CUDA_SUCCESS){ opal_output(0, "CUDA: memmove-Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, (void *)tmp, (int)size); abort(); } cuMemFree(tmp); return dest; }
/* * Need intermediate cuMemcpy function so we can check the return code * of the call. If we see an error, abort as there is no recovery at * this point. */ void *opal_cuda_memcpy(void *dest, void *src, size_t size) { int res; res = cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); if (res != CUDA_SUCCESS) { opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); } else { return dest; } }
/*
 * R-callable wrapper around cuMemcpy.
 *
 * Each argument is read from the first element of the REAL() data of the
 * corresponding R object: r_dst and r_src supply the destination and source
 * addresses, r_ByteCount the number of bytes to copy.  The CUresult produced
 * by the driver is converted with Renum_convert_CUresult() and returned.
 */
SEXP
R_auto_cuMemcpy(SEXP r_dst, SEXP r_src, SEXP r_ByteCount)
{
    /* The values arrive as R doubles and are converted on assignment. */
    CUdeviceptr dev_dst = REAL(r_dst)[0];
    CUdeviceptr dev_src = REAL(r_src)[0];
    size_t n_bytes = REAL(r_ByteCount)[0];

    CUresult status = cuMemcpy(dev_dst, dev_src, n_bytes);

    return Renum_convert_CUresult(status);
}
// Runs the sequence of radix sort passes described by `table` on `data`.
//
// table.pass[i] is the number of passes to run that each cover (i + 1) bits.
// Passes start at data->firstBit and the current bit position advances by the
// pass width after every successful call to sortPass.
//
// The early-exit code reported by sortPass steers the loop:
//   3 - stop issuing passes altogether.
//   2 - move on to the next pass without clearing the first-pass flag.
//   otherwise - the pass counts as the first real pass having been done.
//
// Returns SORT_STATUS_SUCCESS when every issued pass succeeded and the
// trailing keys (if any were saved) were restored. If a pass fails, no
// further passes are issued, the trailing keys are still restored, and the
// status of the failing pass is returned. If only the restore copy fails,
// SORT_STATUS_DEVICE_ERROR is returned.
sortStatus_t sortArrayFromList(sortEngine_t engine, sortData_t data,
    SortTable table) {

    int bit = data->firstBit;

    // Clear the restore size counter at the start of the sort. If this is
    // non-zero at the end of the sort (even if the sort failed), copy from the
    // restore buffer to the buffer that was at keys[0] when the sorting began.
    engine->restoreSourceSize = 0;
    CUdeviceptr firstKeysBuffer = data->keys[0];

    sortStatus_t status = SORT_STATUS_SUCCESS;

    // Loop through each element of the pass table. `done` terminates BOTH
    // loops; a plain `break` would only leave the inner loop and let the
    // outer loop keep issuing passes after a failure.
    bool firstPass = true;
    bool done = false;
    for(int i(0); i < 6 && !done; ++i)
        for(int j(0); j < table.pass[i]; ++j) {
            int endBit = bit + i + 1;
            IntPair sortCode = GetSortCode(bit, endBit, data, firstPass);

            // Initialised so it is never read indeterminate, even if sortPass
            // returns before writing it.
            int earlyExit = 0;
            status = sortPass(engine, data, table.numSortThreads,
                table.valuesPerThread, table.useTransList, bit, endBit,
                sortCode.first, sortCode.second, &earlyExit, data->parity);
            if(SORT_STATUS_SUCCESS != status) {
                done = true;
                break;
            }

            bit = endBit;

            if(3 == earlyExit) {
                done = true;
                break;
            }
            if(2 == earlyExit) continue;

            firstPass = false;
        }

    // Restore the trailing keys. This runs on the failure path as well, so
    // the buffer that held the keys on entry gets its trailing elements back.
    if(engine->restoreSourceSize) {
        CUdeviceptr target = AdjustPointer<uint>(firstKeysBuffer,
            data->numElements);
        CUresult result = cuMemcpy(target, engine->keyRestoreBuffer->Handle(),
            4 * engine->restoreSourceSize);

        // Report the restore failure only if there is no earlier sort error
        // to report; the first failure is the more useful one for the caller.
        if(CUDA_SUCCESS != result && SORT_STATUS_SUCCESS == status)
            status = SORT_STATUS_DEVICE_ERROR;
    }

    return status;
}
//------------------------------------------------------------------------------ void mat_mul_test(const std::vector< std::string >& file_paths, const char* kernel_name, int size, int grid_dim_x, int grid_dim_y, int grid_dim_z, int block_dim_x, int block_dim_y, int block_dim_z) { const int MATRIX_WIDTH = size; const int MATRIX_HEIGHT = MATRIX_WIDTH; const int VECTOR_SIZE = MATRIX_WIDTH; const int MATRIX_SIZE = MATRIX_WIDTH * MATRIX_HEIGHT; const int MATRIX_BYTE_SIZE = sizeof(real_t) * MATRIX_SIZE; const int VECTOR_BYTE_SIZE = sizeof(real_t) * VECTOR_SIZE; CCHECK(cuInit(0)); array_t in_matrix_h(MATRIX_SIZE, real_t(1)); array_t in_vector_h(VECTOR_SIZE, real_t(1)); array_t out_vector_h(VECTOR_SIZE, real_t(0)); CUdeviceptr in_matrix_d = 0; CUdeviceptr in_vector_d = 0; CUdeviceptr out_vector_d = 0; CUdevice device = CUdevice(); CUcontext ctx = CUcontext(); CCHECK(cuCtxCreate(&ctx, 0, device)); CCHECK(cuMemAlloc(&in_matrix_d, MATRIX_BYTE_SIZE)); assert(in_matrix_d); CCHECK(cuMemAlloc(&in_vector_d, VECTOR_BYTE_SIZE)); assert(in_vector_d); CCHECK(cuMemAlloc(&out_vector_d, VECTOR_BYTE_SIZE)); assert(out_vector_d); CCHECK(cuMemcpy(in_matrix_d, CUdeviceptr(&in_matrix_h[0]), MATRIX_BYTE_SIZE)); CCHECK(cuMemcpy(in_vector_d, CUdeviceptr(&in_vector_h[0]), VECTOR_BYTE_SIZE)); CCHECK(cuMemcpy(out_vector_d, CUdeviceptr(&out_vector_h[0]), VECTOR_BYTE_SIZE)); CUmodule module = CUmodule(); CUfunction fun = CUfunction(); build(module, fun, file_paths, kernel_name); void* kernel_params[] = {&in_matrix_d, (void *)(&MATRIX_WIDTH), (void *)(&MATRIX_HEIGHT), &in_vector_d, &out_vector_d}; CCHECK(cuLaunchKernel(fun, grid_dim_x, grid_dim_y, grid_dim_z, block_dim_x, block_dim_y, block_dim_z, 0,//shared_mem_bytes 0,//stream kernel_params, 0)); CCHECK(cuMemcpy(CUdeviceptr(&out_vector_h[0]), out_vector_d, VECTOR_BYTE_SIZE)); // print first two and last elements std::cout << "vector[0] = " << out_vector_h[ 0 ] << '\n'; std::cout << "vector[1] = " << out_vector_h[ 1 ] << '\n'; std::cout << 
"vector[last] = " << out_vector_h.back() << std::endl; CCHECK(cuMemFree(in_matrix_d)); CCHECK(cuMemFree(in_vector_d)); CCHECK(cuMemFree(out_vector_d)); CCHECK(cuModuleUnload(module)); CCHECK(cuCtxDestroy(ctx)); }
void nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int async, unsigned *dims, void *targ_mem_desc) { struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn; CUfunction function; CUresult r; int i; struct ptx_stream *dev_str; void *kargs[1]; void *hp, *dp; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; function = targ_fn->fn; dev_str = select_stream_for_async (async, pthread_self (), false, NULL); assert (dev_str == nvthd->current_stream); /* Initialize the launch dimensions. Typically this is constant, provided by the device compiler, but we must permit runtime values. */ for (i = 0; i != 3; i++) if (targ_fn->launch->dim[i]) dims[i] = targ_fn->launch->dim[i]; /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). 
*/ r = cuMemcpy ((CUdeviceptr)dp, (CUdeviceptr)hp, mapnum * sizeof (void *)); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[0], dims[1], dims[2]); // OpenACC CUDA // // num_gangs nctaid.x // num_workers ntid.y // vector length ntid.x kargs[0] = &dp; r = cuLaunchKernel (function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r)); #ifndef DISABLE_ASYNC if (async < acc_async_noval) { r = cuStreamSynchronize (dev_str->stream); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } else { CUevent *e; e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent)); r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r)); event_gc (true); r = cuEventRecord (*e, dev_str->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r)); event_add (PTX_EVT_KNL, e, (void *)dev_str); } #else r = cuCtxSynchronize (); if (r == CUDA_ERROR_LAUNCH_FAILED) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r), maybe_abort_msg); else if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r)); #endif GOMP_PLUGIN_debug (0, " %s: kernel %s: finished\n", __FUNCTION__, targ_fn->launch->fn); #ifndef DISABLE_ASYNC if (async < acc_async_noval) #endif map_pop (dev_str); }
// Executes one radix sort pass over bits [firstBit, endBit) of data->keys[0].
//
// Parameters:
//   engine, data     sort engine state and the buffers being sorted.
//   numSortThreads, valuesPerThread, useTransList
//                    kernel selection parameters forwarded to
//                    ComputeSortTerms and LoadKernels.
//   firstBit, endBit half-open bit range of this pass; must satisfy
//                    0 <= firstBit < endBit <= 32 and cover at most 6 bits.
//   endKeyFlags      bit mask; SORT_END_KEY_SAVE backs up the keys that
//                    follow the last element, SORT_END_KEY_SET overwrites
//                    them with 0xffffffff.
//   valueCode        selects which value buffers are pushed for the sort
//                    kernel (1: index, 2: single, 3: multi; any other code
//                    pushes no value arguments).
//   earlyExitCode    out: 0 or 1 when the histogram and sort kernels were
//                    run, 2 or 3 when they were skipped. It is written only
//                    after the count kernel has launched successfully.
//   parity           toggled (^= 1) each time the source and target buffers
//                    are swapped.
//
// Returns SORT_STATUS_INVALID_VALUE for bad sizes/bit ranges,
// SORT_STATUS_DEVICE_ERROR for failed memory operations,
// SORT_STATUS_LAUNCH_ERROR for failed kernel launches, the status of
// LoadKernels/AllocSortResources if they fail, and SORT_STATUS_SUCCESS
// otherwise.
sortStatus_t sortPass(sortEngine_t engine, sortData_t data, int numSortThreads,
    int valuesPerThread, bool useTransList, int firstBit, int endBit,
    int endKeyFlags, int valueCode, int* earlyExitCode, int& parity) {

    if(data->numElements > data->maxElements)
        return SORT_STATUS_INVALID_VALUE;

    if((firstBit < 0) || (endBit > 32) || (endBit <= firstBit) ||
        ((endBit - firstBit) > 6))
        return SORT_STATUS_INVALID_VALUE;

    int numBits = endBit - firstBit;

    SortTerms terms = ComputeSortTerms(numSortThreads, valuesPerThread,
        useTransList, numBits, data->numElements, engine->numSMs);

    sortEngine_d::HistKernel* hist;
    sortEngine_d::SortKernel* sort;
    // Single result variable for the whole function; the inner scopes used to
    // declare shadowing copies of it.
    CUresult result;
    sortStatus_t status = LoadKernels(engine, numSortThreads, valuesPerThread,
        useTransList, valueCode, &hist, &sort);
    if(SORT_STATUS_SUCCESS != status) return status;

    status = AllocSortResources(terms, engine);
    if(SORT_STATUS_SUCCESS != status) return status;

    // Set numHistRows into rangePairs if it hasn't already been set to this
    // size.
    if(terms.numHistRows != engine->lastNumHistRowsProcessed) {
        int2* pairs = &engine->rangePairsHost[0];
        int numPairs = terms.numHistBlocks * NumHistWarps;
        int pairCount = terms.numHistRows / numPairs;
        int pairSplit = terms.numHistRows % numPairs;

        // Split the rows into numPairs contiguous [x, y) ranges; the first
        // pairSplit ranges get one extra row so the remainder is spread out.
        pairs[0].x = 0;
        for(int i = 0; i < numPairs; ++i) {
            if(i) pairs[i].x = pairs[i - 1].y;
            pairs[i].y = pairs[i].x + pairCount + (i < pairSplit);
        }

        // Copy rangePairsHost to device memory.
        result = engine->rangePairs->FromHost(&engine->rangePairsHost[0],
            numPairs);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

        engine->lastNumHistRowsProcessed = terms.numHistRows;
    }

    // Save the trailing keys: back up the keys that follow the last element
    // into the engine's restore buffer.
    if((SORT_END_KEY_SAVE & endKeyFlags) && terms.numEndKeys) {
        engine->restoreSourceSize = terms.numEndKeys;
        CUdeviceptr source = AdjustPointer<uint>(data->keys[0],
            data->numElements);
        result = cuMemcpy(engine->keyRestoreBuffer->Handle(), source,
            4 * engine->restoreSourceSize);
        if(CUDA_SUCCESS != result) {
            // The restore buffer does not hold a valid backup. Clear the
            // size so the caller does not copy its contents back over the
            // key buffer when it sees a non-zero restoreSourceSize.
            engine->restoreSourceSize = 0;
            return SORT_STATUS_DEVICE_ERROR;
        }
    }

    // Set the trailing keys to all set bits here.
    if((SORT_END_KEY_SET & endKeyFlags) && terms.numEndKeys) {
        CUdeviceptr target = AdjustPointer<uint>(data->keys[0],
            data->numElements);
        result = cuMemsetD32(target, 0xffffffff, terms.numEndKeys);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
    }

    // Run the count kernel. With early exit enabled the detect counters are
    // cleared first.
    // NOTE(review): the outcome of Fill() is not checked here - confirm
    // whether it reports errors that should abort the pass.
    if(data->earlyExit) engine->sortDetectCounters->Fill(0);

    CuCallStack callStack;
    callStack.Push(data->keys[0], firstBit, data->numElements,
        terms.countValuesPerThread, engine->countBuffer);
    CuFunction* count = data->earlyExit ?
        engine->count->eeFunctions[numBits - 1].get() :
        engine->count->functions[numBits - 1].get();
    result = count->Launch(terms.numCountBlocks, 1, callStack);
    if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

    *earlyExitCode = 0;
    if(data->earlyExit) {
        uint4 detect;
        result = engine->sortDetectCounters->ToHost(&detect, 1);
        if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

        // Counter meanings are taken from the local names below.
        uint radixSort = detect.x;
        uint fullCount = detect.y;
        uint radixCount = detect.z;

        if(terms.numCountBlocks == (int)fullCount) *earlyExitCode = 3;
        else if(terms.numCountBlocks == (int)radixCount) *earlyExitCode = 2;
        // If 5% of the sort blocks are sorted, use the slightly slower early
        // exit sort kernel.
        else if((double)radixSort / terms.numSortBlocks > 0.05)
            *earlyExitCode = 1;
        else *earlyExitCode = 0;
    }

    // Codes 2 and 3 skip the histogram and sort kernels (and the buffer
    // swap) for this pass.
    if(*earlyExitCode <= 1) {

        // Run the three histogram kernels
        callStack.Reset();
        callStack.Push(engine->countBuffer, engine->rangePairs,
            engine->countScan, engine->columnScan);
        result = hist->pass1[numBits - 1]->Launch(terms.numHistBlocks, 1,
            callStack);
        if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

        callStack.Reset();
        callStack.Push(terms.numHistBlocks, engine->countScan);
        result = hist->pass2[numBits - 1]->Launch(1, 1, callStack);
        if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

        callStack.Reset();
        callStack.Push(engine->countBuffer, engine->rangePairs,
            engine->countScan, engine->columnScan, engine->bucketCodes,
            *earlyExitCode);
        result = hist->pass3[numBits - 1]->Launch(terms.numHistBlocks, 1,
            callStack);
        if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;

        // Run the sort kernel
        // Because the max grid size is only 65535 in any dimension, large
        // sorts require multiple kernel launches.
        int MaxGridSize = 65535;
        int numSortLaunches = DivUp(terms.numSortBlocks, MaxGridSize);

        for(int launch(0); launch < numSortLaunches; ++launch) {
            // First block handled by this launch, and how many blocks it
            // covers (the final launch may be partial).
            int block = MaxGridSize * launch;
            int numBlocks = std::min(MaxGridSize,
                terms.numSortBlocks - block);

            callStack.Reset();
            callStack.Push(data->keys[0], block, engine->bucketCodes,
                firstBit, data->keys[1]);

            switch(valueCode) {
                case 1:     // VALUE_TYPE_INDEX
                    callStack.Push(data->values1[1]);
                    break;
                case 2:     // VALUE_TYPE_SINGLE
                    callStack.Push(data->values1[0], data->values1[1]);
                    break;
                case 3:     // VALUE_TYPE_MULTI
                    callStack.Push(data->valueCount,
                        // Six values_global_in
                        data->values1[0], data->values2[0], data->values3[0],
                        data->values4[0], data->values5[0], data->values6[0],

                        // Six values_global_out
                        data->values1[1], data->values2[1], data->values3[1],
                        data->values4[1], data->values5[1], data->values6[1]);
                    break;
            }

            CuFunction* sortFunc = *earlyExitCode ?
                sort->eeFunctions[numBits - 1].get() :
                sort->functions[numBits - 1].get();

            result = sortFunc->Launch(numBlocks, 1, callStack);
            if(CUDA_SUCCESS != result) return SORT_STATUS_LAUNCH_ERROR;
        }

        // Swap the source and target buffers in the data structure.
        std::swap(data->keys[0], data->keys[1]);
        std::swap(data->values1[0], data->values1[1]);
        std::swap(data->values2[0], data->values2[1]);
        std::swap(data->values3[0], data->values3[1]);
        std::swap(data->values4[0], data->values4[1]);
        std::swap(data->values5[0], data->values5[1]);
        std::swap(data->values6[0], data->values6[1]);
        parity ^= 1;
    }

    return SORT_STATUS_SUCCESS;
}