Example #1
sortStatus_t SORTAPI sortDevice(sortEngine_t engine, CUdeviceptr keys, 
	CUdeviceptr values, int numElements, int numBits) {

	MgpuSortData data;
	data.AttachKey(keys);
	if(values) data.AttachVal(0, values);
	sortStatus_t status = data.Alloc(engine, numElements, values ? 1 : 0);
	if(SORT_STATUS_SUCCESS != status) return status;

	data.endBit = numBits;

	status = sortArray(engine, &data);
	if(SORT_STATUS_SUCCESS != status) return status;

	// MGPU sort ping-pongs between two buffer sets; if the result ended up in
	// the second set (parity != 0), copy it back into the caller's buffers.
	if(data.parity) {
		CUresult result = cuMemcpyDtoD(keys, data.keys[1],
			sizeof(uint) * numElements);
		if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
		if(values) {
			result = cuMemcpyDtoD(values, data.values1[1],
				sizeof(uint) * numElements);
			if(CUDA_SUCCESS != result) return SORT_STATUS_DEVICE_ERROR;
		}
	}
	return SORT_STATUS_SUCCESS;
}
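A rough usage sketch for the wrapper above, sorting 32-bit keys already resident on the device. The engine creation is library-specific and not shown; `engine` and `hostKeys` are assumptions introduced only for illustration.

// Sketch only: assumes a current CUDA context and a sortEngine_t obtained
// from the MGPU sort library's engine-creation call (not shown here).
int n = 1 << 20;
CUdeviceptr keys;
cuMemAlloc(&keys, sizeof(uint) * n);
cuMemcpyHtoD(keys, hostKeys, sizeof(uint) * n);      // hostKeys: uint[n]

sortStatus_t status = sortDevice(engine, keys, 0, n, 32);   // keys only
if(SORT_STATUS_SUCCESS != status) { /* handle error */ }

cuMemcpyDtoH(hostKeys, keys, sizeof(uint) * n);      // sorted result
cuMemFree(keys);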
Example #2
CUresult CuDeviceMem::FromDevice(size_t targetOffset, CUdeviceptr source, 
	size_t size) {

	if(targetOffset + size > _size) return CUDA_ERROR_INVALID_VALUE;
	CUresult result = cuMemcpyDtoD(_deviceptr + targetOffset, source, size);
	return result;
}
Example #3
void swanMemcpyDtoD( void *psrc, void *pdest, size_t len ) {
	CUresult err = cuMemcpyDtoD(  PTR_TO_CUDEVPTR( pdest ), PTR_TO_CUDEVPTR(psrc), len );
	
//	err=cuCtxSynchronize();
//	if ( err != CUDA_SUCCESS ) {
//		error("swanMemcpyDtoD sync failed\n" );
//	}
	if ( err != CUDA_SUCCESS ) {
		error("swanMemcpyDtoD failed\n" );
	}

}
Example #4
CUresult I_WRAP_SONAME_FNNAME_ZZ(libcudaZdsoZa, cuMemcpyDtoDAsync)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) {
   int error = 0;
   long vgErrorAddress;
   
   vgErrorAddress = VALGRIND_CHECK_MEM_IS_DEFINED(&hStream, sizeof(CUstream));
   if (vgErrorAddress) {
      error++;
      VALGRIND_PRINTF("Error: 'hStream' in call to cuMemcpyDtoDAsync not defined.\n");
   }

   cgLock();
   
   CUcontext ctx = NULL;
   cgCtxListType *nodeCtx;
   cgMemListType *nodeMemDst, *nodeMemSrc;
   
   
   // Get current context ..
   cgGetCtx(&ctx);
   nodeCtx = cgFindCtx(ctx);
   
   // .. and locate memory if we are handling device memory
   nodeMemDst = cgFindMem(nodeCtx, dstDevice);
   nodeMemSrc = cgFindMem(nodeCtx, srcDevice);
   
   if (nodeMemDst && nodeMemDst->locked & 2 && nodeMemDst->stream != hStream) {
      error++;
      VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
   }
   if (nodeMemSrc && nodeMemSrc->locked && nodeMemSrc->stream != hStream) {
      error++;
      VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
   }
   
   // Mark the destination as write-locked (bit 2) and the source as
   // read-locked (bit 1) on this stream so later wrapped calls can detect
   // conflicting accesses from other streams.
   if (nodeMemDst) {
      nodeMemDst->locked = nodeMemDst->locked | 2;
      nodeMemDst->stream = hStream;
   }
   if (nodeMemSrc) {
      nodeMemSrc->locked = nodeMemSrc->locked | 1;
      nodeMemSrc->stream = hStream;
   }
   
   cgUnlock();
   
   if (error) {
      VALGRIND_PRINTF_BACKTRACE("");
   }
   
   return cuMemcpyDtoD(dstDevice, srcDevice, ByteCount);
}
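For contrast with the checks above, a well-formed asynchronous device-to-device copy through the driver API looks roughly like the following. This is a minimal sketch, not part of the wrapper; it assumes a current context and two valid device allocations of at least `bytes` bytes.

#include <cuda.h>

/* Minimal sketch: enqueue a DtoD copy on its own stream, then wait for it
   so the buffers can be reused. */
static CUresult copyDtoDAsync(CUdeviceptr dst, CUdeviceptr src, size_t bytes)
{
   CUstream stream;
   CUresult err = cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
   if (err != CUDA_SUCCESS) return err;

   err = cuMemcpyDtoDAsync(dst, src, bytes, stream);
   if (err == CUDA_SUCCESS)
      err = cuStreamSynchronize(stream);

   cuStreamDestroy(stream);
   return err;
}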
Example #5
void GPUInterface::MemcpyDeviceToDevice(GPUPtr dest,
                                        GPUPtr src,
                                        size_t memSize) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::MemcpyDeviceToDevice\n");
#endif

    SAFE_CUPP(cuMemcpyDtoD(dest, src, memSize));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::MemcpyDeviceToDevice\n");
#endif

}
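SAFE_CUPP is BEAGLE's own error-checking macro and is defined elsewhere in the library; its exact definition is not shown here. Purely as an illustrative sketch, a driver-API check macro of this kind typically looks like:

/* Illustrative sketch only -- not BEAGLE's actual SAFE_CUPP definition.
   Requires <stdio.h>, <stdlib.h>, and <cuda.h>. */
#define CU_CHECK(call)                                        \
    do {                                                      \
        CUresult err_ = (call);                               \
        if (err_ != CUDA_SUCCESS) {                           \
            fprintf(stderr, "CUDA error %d at %s:%d\n",       \
                    (int)err_, __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                               \
        }                                                     \
    } while (0)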
Example #6
int mcopy_gpu(struct device_info *device_info, CUdeviceptr *a_dev, CUdeviceptr
 *b_dev, CUdeviceptr *c_dev, unsigned int rows, unsigned int cols)
{
	CUresult res;

	/* copy matrix a_dev to c_dev */
	if ((res = cuMemcpyDtoD(*c_dev, *a_dev, rows * cols *
	 sizeof(unsigned int))) != CUDA_SUCCESS) {
		printf("cuMemcpyDtoD failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	return 0;
}
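The snippets in this listing all assume a driver-API context and allocations set up elsewhere. For reference, a complete, self-contained round trip through cuMemcpyDtoD might look like the following sketch; all calls are standard driver-API functions and the CHECK macro is a local convenience.

#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHECK(call)                                             \
	do {                                                        \
		CUresult e_ = (call);                                   \
		if (e_ != CUDA_SUCCESS) {                               \
			fprintf(stderr, "%s failed: %d\n", #call, (int)e_); \
			exit(EXIT_FAILURE);                                 \
		}                                                       \
	} while (0)

int main(void)
{
	enum { N = 1024 };
	unsigned int in[N], out[N];
	for (int i = 0; i < N; i++) { in[i] = i; out[i] = 0; }

	CUdevice dev;
	CUcontext ctx;
	CHECK(cuInit(0));
	CHECK(cuDeviceGet(&dev, 0));
	CHECK(cuCtxCreate(&ctx, 0, dev));

	CUdeviceptr a, b;
	CHECK(cuMemAlloc(&a, sizeof(in)));
	CHECK(cuMemAlloc(&b, sizeof(in)));

	CHECK(cuMemcpyHtoD(a, in, sizeof(in)));    /* host -> device a     */
	CHECK(cuMemcpyDtoD(b, a, sizeof(in)));     /* device a -> device b */
	CHECK(cuMemcpyDtoH(out, b, sizeof(in)));   /* device b -> host     */

	printf("copy %s\n", memcmp(in, out, sizeof(in)) ? "FAILED" : "ok");

	CHECK(cuMemFree(a));
	CHECK(cuMemFree(b));
	CHECK(cuCtxDestroy(ctx));
	return 0;
}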
Example #7
  void memory_t<CUDA>::copyTo(memory_v *dest,
                              const uintptr_t bytes,
                              const uintptr_t destOffset,
                              const uintptr_t srcOffset){
    const uintptr_t bytes_ = (bytes == 0) ? size : bytes;

    OCCA_CHECK((bytes_ + srcOffset)  <= size);
    OCCA_CHECK((bytes_ + destOffset) <= dest->size);

    void *dstPtr, *srcPtr;

    if(isTexture)
      srcPtr = (void*) ((CUDATextureData_t*) handle)->array;
    else
      srcPtr = handle;

    if(dest->isTexture)
      dstPtr = (void*) ((CUDATextureData_t*) dest->handle)->array;
    else
      dstPtr = dest->handle;

    if(!isTexture){
      if(!dest->isTexture)
        OCCA_CUDA_CHECK("Memory: Copy To [Memory -> Memory]",
                        cuMemcpyDtoD(*((CUdeviceptr*) dstPtr) + destOffset,
                                     *((CUdeviceptr*) srcPtr) + srcOffset,
                                     bytes_) );
      else
        OCCA_CUDA_CHECK("Memory: Copy To [Memory -> Texture]",
                        cuMemcpyDtoA((CUarray) dstPtr         , destOffset,
                                     *((CUdeviceptr*) srcPtr) + srcOffset,
                                     bytes_) );
    }
    else{
      if(dest->isTexture)
        OCCA_CUDA_CHECK("Memory: Copy To [Texture -> Texture]",
                        cuMemcpyAtoA((CUarray) dstPtr, destOffset,
                                     (CUarray) srcPtr, srcOffset,
                                     bytes_) );
      else
        OCCA_CUDA_CHECK("Memory: Copy To [Texture -> Memory]",
                        cuMemcpyAtoD(*((CUdeviceptr*) dstPtr) + destOffset,
                                     (CUarray) srcPtr         , srcOffset,
                                     bytes_) );
    }
  }
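The texture branches above route through CUDA arrays rather than linear memory. Independent of OCCA's handle types, a standalone sketch of the linear-memory-to-array case looks like this; it assumes a current context and a CUdeviceptr `src` holding `n` floats.

// Sketch only: copy n floats from linear device memory into a 1D CUDA array.
CUarray arr;
CUDA_ARRAY_DESCRIPTOR desc;
desc.Width       = n;
desc.Height      = 0;                  // height 0 => one-dimensional array
desc.Format      = CU_AD_FORMAT_FLOAT;
desc.NumChannels = 1;

CUresult err = cuArrayCreate(&arr, &desc);
if (err == CUDA_SUCCESS)
	err = cuMemcpyDtoA(arr, 0 /* byte offset */, src, n * sizeof(float));
// ... use the array (e.g. bind it for texturing), then cuArrayDestroy(arr).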
Example #8
void
pocl_cuda_write (void *data, const void *host_ptr, void *device_ptr,
                 size_t offset, size_t cb)
{
  CUresult result
      = cuMemcpyHtoD ((CUdeviceptr) (device_ptr + offset), host_ptr, cb);
  CUDA_CHECK (result, "cuMemcpyHtoD");
}

void
pocl_cuda_copy (void *data, const void *src_ptr, size_t src_offset,
                void *__restrict__ dst_ptr, size_t dst_offset, size_t cb)
{
  if (src_ptr == dst_ptr)
    return;

  CUresult result = cuMemcpyDtoD ((CUdeviceptr) (dst_ptr + dst_offset),
                                  (CUdeviceptr) (src_ptr + src_offset), cb);
  CUDA_CHECK (result, "cuMemcpyDtoD");
}

void
pocl_cuda_read_rect (void *data, void *__restrict__ const host_ptr,
                     void *__restrict__ const device_ptr,
                     const size_t *__restrict__ const buffer_origin,
                     const size_t *__restrict__ const host_origin,
                     const size_t *__restrict__ const region,
                     size_t const buffer_row_pitch,
                     size_t const buffer_slice_pitch,
                     size_t const host_row_pitch,
                     size_t const host_slice_pitch)
{
  CUDA_MEMCPY3D params = { 0 };
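The read_rect path is cut off above. A rough sketch of how the rest of the descriptor is typically filled in for a device-to-host rectangular copy follows; the field names are from the driver API's CUDA_MEMCPY3D struct, and this is not pocl's actual code.

  /* Sketch only (not pocl's code). Pitches are assumed already normalized
     to non-zero values by the caller. Describe the device-side source ... */
  params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
  params.srcDevice = (CUdeviceptr)device_ptr;
  params.srcXInBytes = buffer_origin[0];
  params.srcY = buffer_origin[1];
  params.srcZ = buffer_origin[2];
  params.srcPitch = buffer_row_pitch;
  params.srcHeight = buffer_slice_pitch / buffer_row_pitch;

  /* ... the host-side destination ... */
  params.dstMemoryType = CU_MEMORYTYPE_HOST;
  params.dstHost = host_ptr;
  params.dstXInBytes = host_origin[0];
  params.dstY = host_origin[1];
  params.dstZ = host_origin[2];
  params.dstPitch = host_row_pitch;
  params.dstHeight = host_slice_pitch / host_row_pitch;

  /* ... and the region to copy, then issue the copy. */
  params.WidthInBytes = region[0];
  params.Height = region[1];
  params.Depth = region[2];

  CUresult result = cuMemcpy3D (&params);
  CUDA_CHECK (result, "cuMemcpy3D");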
Example #9
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  count = (count > 2) ? 2 : count;

  CUdevice devices[count];
  for (int i = 0; i < count; i++)
    CU_ERROR_CHECK(cuDeviceGet(&devices[i], i));

  // Question 1:  Can you create multiple contexts on the same device?
  {
    fprintf(stderr, "Attempting to create multiple contexts on each device...\n");
    CUcontext contexts[count * N];
    size_t j = 0;
    for (int i = 0; i < count; i++) {
      CUresult error = CUDA_SUCCESS;
      size_t k;
      for (k = 0; k < N && error == CUDA_SUCCESS; k++) {
        error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]);
        if (error == CUDA_SUCCESS)
          CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++]));
      }
      fprintf(stderr, "  created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error));
    }

    CUresult error = CUDA_SUCCESS;
    size_t k;
    for (k = 0; k < j && error == CUDA_SUCCESS; k++)
      error = cuCtxPushCurrent(contexts[k]);
    if (error == CUDA_SUCCESS)
      fprintf(stderr, "  successfully pushed %zu contexts with cuCtxPushCurrent\n", k);
    else
      fprintf(stderr, "  pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error));

    for (size_t k = 0; k < j; k++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[k]));

    fprintf(stderr, "\n");
  }

  CUcontext contexts[count][2];
  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++) {
      CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j]));
    }
  }

  // Question 2:  Can you access a host pointer in a different context from
  // which it was created?
  // Question 3:  Can you free a host pointer in a different context from which
  // it was created?
  {
    void * hPtr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUdeviceptr dPtr[count];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device
    fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 4:  Can you access a device pointer in a different context from
  // which it was created?
  // Question 5:  Can you free a device pointer in a different context from which
  // it was created?
  {
    CUdeviceptr dPtr[count][2];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0][1]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1][0]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 6:  Can you access a module in a different context from which it
  // was loaded?
  // Question 7:  Can you unload a module in a different context from which it
  // was loaded?
  {
    CUmodule module;
    CUdeviceptr ptr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuModuleLoad(&module,  "kernel-test.ptx"));
    CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUfunction function = 0;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    CUdeviceptr a, b;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    void * params[] = { &a, &b };

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemFree(a));
    CU_ERROR_CHECK(cuMemFree(b));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
  }

  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j]));
  }

  return 0;
}
Example #10
CUresult CuDeviceMem::FromDevice(CUdeviceptr source) {
	CUresult result = cuMemcpyDtoD(_deviceptr, source, _size);
	return result;
}
Example #11
CUresult CuDeviceMem::ToDevice(size_t sourceOffset, CUdeviceptr target,
	size_t size) {
	CUresult result = cuMemcpyDtoD(target, _deviceptr + sourceOffset, size);
	return result;
}
Example #12
CUresult CuDeviceMem::ToDevice(CUdeviceptr target) {
	CUresult result = cuMemcpyDtoD(target, _deviceptr, _size);
	return result;
}
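Examples #2 and #10 through #12 wrap cuMemcpyDtoD in an object that tracks its own pointer and size so offsets can be bounds-checked. The same idea in plain driver-API C, as a minimal sketch with hypothetical names, is:

/* Minimal sketch of the bounds-checked DtoD copy these wrappers perform:
   reject any copy that would run past the end of the tracked allocation. */
typedef struct {
	CUdeviceptr ptr;
	size_t      size;
} DeviceBuffer;

static CUresult bufferFromDevice(DeviceBuffer *dst, size_t dstOffset,
	CUdeviceptr src, size_t bytes)
{
	if (dstOffset + bytes > dst->size) return CUDA_ERROR_INVALID_VALUE;
	return cuMemcpyDtoD(dst->ptr + dstOffset, src, bytes);
}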