sortStatus_t SORTAPI sortDevice(sortEngine_t engine, CUdeviceptr keys,
  CUdeviceptr values, int numElements, int numBits) {

  MgpuSortData data;
  data.AttachKey(keys);
  if(values) data.AttachVal(0, values);

  sortStatus_t status = data.Alloc(engine, numElements, values ? 1 : 0);
  if(SORT_STATUS_SUCCESS != status) return status;

  data.endBit = numBits;

  status = sortArray(engine, &data);
  if(SORT_STATUS_SUCCESS != status) return status;

  if(data.parity) {
    // The sorted results ended up in the temporary buffers - copy them back
    // into the caller's arrays.
    CUresult result = cuMemcpyDtoD(keys, data.keys[1],
      sizeof(uint) * numElements);
    if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;

    if(values) {
      result = cuMemcpyDtoD(values, data.values1[1],
        sizeof(uint) * numElements);
      if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
    }
  }
  return SORT_STATUS_SUCCESS;
}
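/*
 * A minimal calling sketch for sortDevice above. This is an assumption-laden
 * illustration, not part of the library: it presumes the caller already has a
 * valid sortEngine_t and a device array of numElements 32-bit keys. Passing
 * values == 0 requests a keys-only sort over all 32 key bits.
 */
sortStatus_t sortKeysOnly(sortEngine_t engine, CUdeviceptr d_keys, int numElements) {
  return sortDevice(engine, d_keys, 0, numElements, 32);
}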
CUresult CuDeviceMem::FromDevice(size_t targetOffset, CUdeviceptr source,
  size_t size) {
  if(targetOffset + size > _size) return CUDA_ERROR_INVALID_VALUE;
  CUresult result = cuMemcpyDtoD(_deviceptr + targetOffset, source, size);
  return result;
}
void swanMemcpyDtoD(void *psrc, void *pdest, size_t len) {
  CUresult err = cuMemcpyDtoD(PTR_TO_CUDEVPTR(pdest), PTR_TO_CUDEVPTR(psrc), len);
//  err = cuCtxSynchronize();
//  if(err != CUDA_SUCCESS) {
//    error("swanMemcpyDtoD sync failed\n");
//  }
  if(err != CUDA_SUCCESS) {
    error("swanMemcpyDtoD failed\n");
  }
}
CUresult I_WRAP_SONAME_FNNAME_ZZ(libcudaZdsoZa, cuMemcpyDtoDAsync)(
    CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) {
  int error = 0;
  long vgErrorAddress;

  vgErrorAddress = VALGRIND_CHECK_MEM_IS_DEFINED(&hStream, sizeof(CUstream));
  if (vgErrorAddress) {
    error++;
    VALGRIND_PRINTF("Error: 'hStream' in call to cuMemcpyDtoDAsync not defined.\n");
  }

  cgLock();

  CUcontext ctx = NULL;
  cgCtxListType *nodeCtx;
  cgMemListType *nodeMemDst, *nodeMemSrc;

  // Get current context ..
  cgGetCtx(&ctx);
  nodeCtx = cgFindCtx(ctx);

  // .. and locate memory if we are handling device memory
  nodeMemDst = cgFindMem(nodeCtx, dstDevice);
  nodeMemSrc = cgFindMem(nodeCtx, srcDevice);

  if (nodeMemDst && nodeMemDst->locked & 2 && nodeMemDst->stream != hStream) {
    error++;
    VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
  }
  if (nodeMemSrc && nodeMemSrc->locked && nodeMemSrc->stream != hStream) {
    error++;
    VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
  }

  if (nodeMemDst) {
    nodeMemDst->locked = nodeMemDst->locked | 2;
    nodeMemDst->stream = hStream;
  }
  if (nodeMemSrc) {
    nodeMemSrc->locked = nodeMemSrc->locked | 1;
    nodeMemSrc->stream = hStream;
  }

  cgUnlock();

  if (error) {
    VALGRIND_PRINTF_BACKTRACE("");
  }

  return cuMemcpyDtoD(dstDevice, srcDevice, ByteCount);
}
void GPUInterface::MemcpyDeviceToDevice(GPUPtr dest, GPUPtr src, size_t memSize) {
#ifdef BEAGLE_DEBUG_FLOW
  fprintf(stderr, "\t\t\tEntering GPUInterface::MemcpyDeviceToDevice\n");
#endif

  SAFE_CUPP(cuMemcpyDtoD(dest, src, memSize));

#ifdef BEAGLE_DEBUG_FLOW
  fprintf(stderr, "\t\t\tLeaving GPUInterface::MemcpyDeviceToDevice\n");
#endif
}
int mcopy_gpu(struct device_info *device_info, CUdeviceptr *a_dev,
  CUdeviceptr *b_dev, CUdeviceptr *c_dev, unsigned int rows,
  unsigned int cols) {
  CUresult res;

  /* copy matrix a_dev to c_dev */
  if ((res = cuMemcpyDtoD(*c_dev, *a_dev,
      rows * cols * sizeof(unsigned int))) != CUDA_SUCCESS) {
    printf("cuMemcpyDtoD failed: res = %lu\n", (unsigned long)res);
    return -1;
  }

  return 0;
}
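/*
 * For reference, a self-contained sketch of the bare driver-API pattern that
 * wrappers like mcopy_gpu build on: initialize the driver, create a context,
 * allocate two device buffers, and copy one into the other with cuMemcpyDtoD.
 * The buffer size, device index, and error handling are illustrative
 * assumptions only, not taken from any snippet in this listing.
 */
#include <cuda.h>
#include <stdio.h>

static int dtod_example(void) {
  CUdevice dev;
  CUcontext ctx;
  CUdeviceptr src, dst;
  const size_t bytes = 1024 * sizeof(unsigned int);
  CUresult res;

  if ((res = cuInit(0)) != CUDA_SUCCESS) return -1;
  if ((res = cuDeviceGet(&dev, 0)) != CUDA_SUCCESS) return -1;
  if ((res = cuCtxCreate(&ctx, 0, dev)) != CUDA_SUCCESS) return -1;

  if ((res = cuMemAlloc(&src, bytes)) != CUDA_SUCCESS) return -1;
  if ((res = cuMemAlloc(&dst, bytes)) != CUDA_SUCCESS) return -1;

  /* Device-to-device copy; both pointers belong to the current context. */
  if ((res = cuMemcpyDtoD(dst, src, bytes)) != CUDA_SUCCESS) {
    printf("cuMemcpyDtoD failed: res = %lu\n", (unsigned long)res);
    return -1;
  }

  cuMemFree(dst);
  cuMemFree(src);
  cuCtxDestroy(ctx);
  return 0;
}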
void memory_t<CUDA>::copyTo(memory_v *dest, const uintptr_t bytes,
    const uintptr_t destOffset, const uintptr_t srcOffset) {
  const uintptr_t bytes_ = (bytes == 0) ? size : bytes;

  OCCA_CHECK((bytes_ + srcOffset)  <= size);
  OCCA_CHECK((bytes_ + destOffset) <= dest->size);

  void *dstPtr, *srcPtr;

  // For textures, copy to/from the underlying CUDA array; otherwise use the
  // raw device-pointer handle.
  if(!isTexture)
    srcPtr = handle;
  else
    srcPtr = (void*) ((CUDATextureData_t*) handle)->array;

  if(!(dest->isTexture))
    dstPtr = dest->handle;
  else
    dstPtr = (void*) ((CUDATextureData_t*) dest->handle)->array;

  if(!isTexture) {
    if(!dest->isTexture)
      OCCA_CUDA_CHECK("Memory: Copy To [Memory -> Memory]",
                      cuMemcpyDtoD(*((CUdeviceptr*) dstPtr) + destOffset,
                                   *((CUdeviceptr*) srcPtr) + srcOffset,
                                   bytes_));
    else
      OCCA_CUDA_CHECK("Memory: Copy To [Memory -> Texture]",
                      cuMemcpyDtoA((CUarray) dstPtr, destOffset,
                                   *((CUdeviceptr*) srcPtr) + srcOffset,
                                   bytes_));
  }
  else {
    if(!dest->isTexture)
      OCCA_CUDA_CHECK("Memory: Copy To [Texture -> Memory]",
                      cuMemcpyAtoD(*((CUdeviceptr*) dstPtr) + destOffset,
                                   (CUarray) srcPtr, srcOffset,
                                   bytes_));
    else
      OCCA_CUDA_CHECK("Memory: Copy To [Texture -> Texture]",
                      cuMemcpyAtoA((CUarray) dstPtr, destOffset,
                                   (CUarray) srcPtr, srcOffset,
                                   bytes_));
  }
}
void
pocl_cuda_write (void *data, const void *host_ptr, void *device_ptr,
                 size_t offset, size_t cb)
{
  CUresult result = cuMemcpyHtoD ((CUdeviceptr) (device_ptr + offset),
                                  host_ptr, cb);
  CUDA_CHECK (result, "cuMemcpyHtoD");
}

void
pocl_cuda_copy (void *data, const void *src_ptr, size_t src_offset,
                void *__restrict__ dst_ptr, size_t dst_offset, size_t cb)
{
  if (src_ptr == dst_ptr)
    return;

  CUresult result = cuMemcpyDtoD ((CUdeviceptr) (dst_ptr + dst_offset),
                                  (CUdeviceptr) (src_ptr + src_offset), cb);
  CUDA_CHECK (result, "cuMemcpyDtoD");
}

void
pocl_cuda_read_rect (void *data, void *__restrict__ const host_ptr,
                     void *__restrict__ const device_ptr,
                     const size_t *__restrict__ const buffer_origin,
                     const size_t *__restrict__ const host_origin,
                     const size_t *__restrict__ const region,
                     size_t const buffer_row_pitch,
                     size_t const buffer_slice_pitch,
                     size_t const host_row_pitch,
                     size_t const host_slice_pitch)
{
  CUDA_MEMCPY3D params = { 0 };
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));
  count = (count > 2) ? 2 : count;

  CUdevice devices[count];
  for (int i = 0; i < count; i++)
    CU_ERROR_CHECK(cuDeviceGet(&devices[i], i));

  // Question 1: Can you create multiple contexts on the same device?
  {
    fprintf(stderr, "Attempting to create multiple contexts on each device...\n");
    CUcontext contexts[count * N];
    size_t j = 0;
    for (int i = 0; i < count; i++) {
      CUresult error = CUDA_SUCCESS;
      size_t k;
      for (k = 0; k < N && error == CUDA_SUCCESS; k++) {
        error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]);
        if (error == CUDA_SUCCESS)
          CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++]));
      }
      fprintf(stderr, " created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error));
    }

    CUresult error = CUDA_SUCCESS;
    size_t k;
    for (k = 0; k < j && error == CUDA_SUCCESS; k++)
      error = cuCtxPushCurrent(contexts[k]);

    if (error == CUDA_SUCCESS)
      fprintf(stderr, " successfully pushed %zu contexts with cuCtxPushCurrent\n", k);
    else
      fprintf(stderr, " pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error));

    for (size_t k = 0; k < j; k++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[k]));

    fprintf(stderr, "\n");
  }

  CUcontext contexts[count][2];
  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++) {
      CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j]));
    }
  }

  // Question 2: Can you access a host pointer in a different context from
  // which it was created?
  // Question 3: Can you free a host pointer in a different context from which
  // it was created?
  {
    void * hPtr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUdeviceptr dPtr[count];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024));

    // Different context, same device
    fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024)));

    CU_ERROR_CHECK(cuMemFree(dPtr[0]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024));

      // Different context, different device
      fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024)));

      CU_ERROR_CHECK(cuMemFree(dPtr[1]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 4: Can you access a device pointer in a different context from
  // which it was created?
  // Question 5: Can you free a device pointer in a different context from which
  // it was created?
  {
    CUdeviceptr dPtr[count][2];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));

    // Different context, same device
    fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024)));

    CU_ERROR_CHECK(cuMemFree(dPtr[0][1]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024));

      // Different context, different device
      fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024)));

      CU_ERROR_CHECK(cuMemFree(dPtr[1][0]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 6: Can you access a module in a different context from which it
  // was loaded?
  // Question 7: Can you unload a module in a different context from which it
  // was loaded?
  {
    CUmodule module;
    CUdeviceptr ptr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuModuleLoad(&module, "kernel-test.ptx"));
    CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUfunction function = 0;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    CUdeviceptr a, b;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    void * params[] = { &a, &b };

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemFree(a));
    CU_ERROR_CHECK(cuMemFree(b));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
  }

  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j]));
  }

  return 0;
}
CUresult CuDeviceMem::FromDevice(CUdeviceptr source) {
  CUresult result = cuMemcpyDtoD(_deviceptr, source, _size);
  return result;
}

CUresult CuDeviceMem::ToDevice(size_t sourceOffset, CUdeviceptr target, size_t size) {
  CUresult result = cuMemcpyDtoD(target, _deviceptr + sourceOffset, size);
  return result;
}

CUresult CuDeviceMem::ToDevice(CUdeviceptr target) {
  CUresult result = cuMemcpyDtoD(target, _deviceptr, _size);
  return result;
}
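/*
 * A brief usage sketch for the CuDeviceMem copy helpers above. Only the
 * methods shown in this listing are used; how CuDeviceMem instances are
 * constructed and allocated is not shown here and is assumed to be handled
 * elsewhere by the surrounding library.
 */
CUresult CopyWholeBufferToRawPointer(CuDeviceMem& buf, CUdeviceptr target) {
  // Whole-buffer device-to-device copy via the ToDevice overload above.
  return buf.ToDevice(target);
}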