/** Documented at declaration */
int
gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
{
    if ( coder->data_raw != NULL )
        cudaFreeHost(coder->data_raw);
    if ( coder->d_data_raw != NULL )
        cudaFree(coder->d_data_raw);
    if ( coder->d_data != NULL )
        cudaFree(coder->d_data);
    if ( coder->data_quantized != NULL )
        cudaFreeHost(coder->data_quantized);
    if ( coder->d_data_quantized != NULL )
        cudaFree(coder->d_data_quantized);
    if ( coder->data_compressed != NULL )
        cudaFreeHost(coder->data_compressed);
    if ( coder->d_data_compressed != NULL )
        cudaFree(coder->d_data_compressed);
    if ( coder->segment != NULL )
        cudaFreeHost(coder->segment);
    if ( coder->d_segment != NULL )
        cudaFree(coder->d_segment);
    if ( coder->d_temp_huffman != NULL )
        cudaFree(coder->d_temp_huffman);
    if ( coder->block_list != NULL )
        cudaFreeHost(coder->block_list);
    if ( coder->d_block_list != NULL )
        cudaFree(coder->d_block_list);
    return 0;
}
static void loadOprodFromCPUArrayQuda(void *cudaOprodEven, void *cudaOprodOdd,
                                      void *cpuOprod, size_t bytes, int Vh)
{
  // Use pinned memory
  float2 *packedEven, *packedOdd;
  checkCudaError();
  if (cudaMallocHost(&packedEven, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
  }
  if (cudaMallocHost(&packedOdd, bytes) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
  }
  checkCudaError();

  packOprodField(packedEven, (float*)cpuOprod, 0, Vh);
  packOprodField(packedOdd, (float*)cpuOprod, 1, Vh);
  checkCudaError();

  cudaMemset(cudaOprodEven, 0, bytes);
  cudaMemset(cudaOprodOdd, 0, bytes);
  checkCudaError();

  cudaMemcpy(cudaOprodEven, packedEven, bytes, cudaMemcpyHostToDevice);
  checkCudaError();
  cudaMemcpy(cudaOprodOdd, packedOdd, bytes, cudaMemcpyHostToDevice);
  checkCudaError();

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
static void copyOprodFromCPUArrayQuda(FullOprod cudaOprod, void *cpuOprod,
                                      size_t bytes_per_dir, int Vh)
{
  // Use pinned memory
  float2 *packedEven, *packedOdd;
  if (cudaMallocHost(&packedEven, bytes_per_dir) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
  }
  if (cudaMallocHost(&packedOdd, bytes_per_dir) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
  }

  for (int dir=0; dir<4; dir++) {
    packOprodFieldDir(packedEven, (float*)cpuOprod, dir, 0, Vh);
    packOprodFieldDir(packedOdd, (float*)cpuOprod, dir, 1, Vh);
    cudaMemset(cudaOprod.even.data[dir], 0, bytes_per_dir);
    cudaMemset(cudaOprod.odd.data[dir], 0, bytes_per_dir);
    checkCudaError();
    cudaMemcpy(cudaOprod.even.data[dir], packedEven, bytes_per_dir, cudaMemcpyHostToDevice);
    cudaMemcpy(cudaOprod.odd.data[dir], packedOdd, bytes_per_dir, cudaMemcpyHostToDevice);
    checkCudaError();
  }

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
void loadParityClover(ParityClover ret, void *clover, QudaPrecision cpu_prec,
                      CloverFieldOrder clover_order)
{
  // use pinned memory
  void *packedClover, *packedCloverNorm;

  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover_order");
  }

#ifndef __DEVICE_EMULATION__
  if (cudaMallocHost(&packedClover, ret.bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating clover pinned memory");
  }
  if (ret.precision == QUDA_HALF_PRECISION) {
    if (cudaMallocHost(&packedCloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating clover norm pinned memory");
    }
  }
#else
  packedClover = malloc(ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) packedCloverNorm = malloc(ret.bytes/18);
#endif

  if (ret.precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)clover, ret.volume, ret.pad);
  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)clover, ret.volume, ret.pad);
    } else {
      packParityClover((float4 *)packedClover, (float *)clover, ret.volume, ret.pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (double *)clover, ret.volume, ret.pad);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (float *)clover, ret.volume, ret.pad);
    }
  }

  cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice);
  if (ret.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
#else
  free(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) free(packedCloverNorm);
#endif
}
void free_data_arr(DataArray* data_arr)
{
  cudaFreeHost(*(data_arr->data_r));
  printf("host r space freed\n");
  cudaFreeHost(*(data_arr->data_k));
  printf("host k space freed\n");

  // cudaFree(*(data_arr->data_r_dev));
  // cudaDeviceSynchronize();
  // printf("device r space freed\n");
  // cudaFree(*(data_arr->data_k_dev));
  // cudaDeviceSynchronize();
  // printf("device k space freed\n");
}
void cudaCloverField::loadFullField(void *even, void *evenNorm, void *odd, void *oddNorm,
                                    const void *h_clover, const QudaPrecision cpu_prec,
                                    const CloverFieldOrder cpu_order)
{
  // use pinned memory
  void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;

  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (cpu_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover order");
  }

  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);
  if (precision == QUDA_HALF_PRECISION) {
    cudaMallocHost(&packedEvenNorm, norm_bytes/2);
    cudaMallocHost(&packedOddNorm, norm_bytes/2);
  }

  // Pack from the host field h_clover (not the device-side member).
  if (precision == QUDA_DOUBLE_PRECISION) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)h_clover, x, pad);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)h_clover, x, pad);
    } else {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)h_clover, x, pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm,
                         (short4 *)packedOdd, (float *)packedOddNorm,
                         (double *)h_clover, x, pad);
    } else {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm,
                         (short4 *)packedOdd, (float *)packedOddNorm,
                         (float *)h_clover, x, pad);
    }
  }

  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);
  if (precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(evenNorm, packedEvenNorm, norm_bytes/2, cudaMemcpyHostToDevice);
    cudaMemcpy(oddNorm, packedOddNorm, norm_bytes/2, cudaMemcpyHostToDevice);
  }

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (precision == QUDA_HALF_PRECISION) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
}
void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n");

    cudaError_t err;

    // Free all the memory, then look for errors.
    cudaFree((void *)this->DeviceHashlistAddress);
    cudaFreeHost((void *)this->HostSuccessAddress);
    delete[] this->HostSuccessReportedAddress;

    // Only cudaFree the device copies if zero copy is NOT in use
    // (with zero copy, the device pointers alias the pinned host buffers).
    if (!this->useZeroCopy) {
        cudaFree((void *)this->DeviceSuccessAddress);
        cudaFree((void *)this->DeviceFoundPasswordsAddress);
    }

    cudaFreeHost((void *)this->HostFoundPasswordsAddress);
    cudaFreeHost((void *)this->HostStartPointAddress);
    cudaFree((void *)this->DeviceStartPointAddress);
    cudaFree((void *)this->DeviceStartPasswords32Address);

    // Only free the bitmap memory if it has been allocated.
    if (this->DeviceBitmap128mb_a_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_a_Address);
        this->DeviceBitmap128mb_a_Address = 0;
    }
    if (this->DeviceBitmap128mb_b_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_b_Address);
        this->DeviceBitmap128mb_b_Address = 0;
    }
    if (this->DeviceBitmap128mb_c_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_c_Address);
        this->DeviceBitmap128mb_c_Address = 0;
    }
    if (this->DeviceBitmap128mb_d_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_d_Address);
        this->DeviceBitmap128mb_d_Address = 0;
    }

    // Get any error that occurred above and report it.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Thread %d: CUDA error freeing memory: %s. Exiting.\n",
               this->threadId, cudaGetErrorString(err));
        exit(1);
    }
}
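// A minimal, self-contained sketch (not part of the class above) of how zero-copy buffers
// like the ones freed above are typically obtained: pinned memory allocated with
// cudaHostAllocMapped plus cudaHostGetDevicePointer. The device alias shares the pinned
// pages, which is why the code above skips cudaFree for those pointers and releases them
// through cudaFreeHost only. The helper name and error handling are illustrative assumptions.
#include <cuda_runtime.h>
#include <cstdio>

static int allocZeroCopyBuffer(void **hostPtr, void **devicePtr, size_t bytes) {
    cudaError_t err = cudaHostAlloc(hostPtr, bytes, cudaHostAllocMapped);
    if (err != cudaSuccess) {
        printf("cudaHostAlloc failed: %s\n", cudaGetErrorString(err));
        return -1;
    }
    // Get the device-visible alias of the same pinned pages.
    err = cudaHostGetDevicePointer(devicePtr, *hostPtr, 0);
    if (err != cudaSuccess) {
        printf("cudaHostGetDevicePointer failed: %s\n", cudaGetErrorString(err));
        cudaFreeHost(*hostPtr);
        return -1;
    }
    return 0;
}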
void GRTRegenerateChains::FreePerGPUMemory(GRTRegenerateThreadRunData *data) {
    CH_CUDA_SAFE_CALL(cudaFree(this->DEVICE_Hashes[data->threadID]));
    CH_CUDA_SAFE_CALL(cudaFreeHost(this->HOST_Success[data->threadID]));
    CH_CUDA_SAFE_CALL(cudaFreeHost(this->HOST_Passwords[data->threadID]));

    // Only free the device memory if zero copy was NOT used
    if (!this->CommandLineData->GetUseZeroCopy()) {
        CH_CUDA_SAFE_CALL(cudaFree(this->DEVICE_Passwords[data->threadID]));
        CH_CUDA_SAFE_CALL(cudaFree(this->DEVICE_Success[data->threadID]));
    }

    delete[] this->HOST_Success_Reported[data->threadID];
    //printf("Memory for thread %d freed.\n", data->threadID);
}
OpenSteer::MemoryBackend::~MemoryBackend() {
    std::cout << "MemoryBackend reset" << std::endl;
    if (_data != 0) {
        cudaFreeHost(_data);
    }
    if (_const != 0) {
        cudaFreeHost(_const);
    }
    _data = 0;
    _const = 0;
    _instance = 0;
    _idCounter = 0;
}
void loadMomField(Float2 *even, Float2 *odd, Float *mom, int bytes, int Vh, int pad)
{
  Float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);

  packMomField(packedEven, (Float*)mom, 0, Vh, pad);
  packMomField(packedOdd, (Float*)mom, 1, Vh, pad);

  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
/**
 * destructor
 */
virtual ~MappedBufferIntern()
{
    __startOperation(ITask::TASK_CUDA);
    __startOperation(ITask::TASK_HOST);

    if (pointer && ownPointer)
    {
#if( PMACC_CUDA_ENABLED == 1 )
        /* cupla 0.1.0 does not support the function cudaHostAlloc to create mapped memory.
         * Therefore we need to call the native CUDA function cudaFreeHost to free memory.
         * Due to the renaming of cuda functions with cupla via macros we need to remove
         * the renaming to get access to the native cuda function.
         * @todo this is a workaround please fix me. We need to investigate if
         * it is possible to have mapped/unified memory in alpaka.
         *
         * corresponding alpaka issues:
         * https://github.com/ComputationalRadiationPhysics/alpaka/issues/296
         * https://github.com/ComputationalRadiationPhysics/alpaka/issues/612
         */
#   undef cudaFreeHost
        CUDA_CHECK((cuplaError_t)cudaFreeHost(pointer));
        // re-introduce the cupla macro
#   define cudaFreeHost(...) cuplaFreeHost(__VA_ARGS__)
#else
        __deleteArray(pointer);
#endif
    }
}
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;

    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
/**
 * destructor
 */
virtual ~HostBufferIntern() throw (std::runtime_error)
{
    if (pointer && ownPointer)
    {
        CUDA_CHECK(cudaFreeHost(pointer));
    }
}
void allocate(std::size_t size_, std::size_t nstreams, nt2::host_ &)
{
  if (size_ > size)
  {
    // Release the previous pinned-host / device pairs before growing.
    if (size != 0)
    {
      for (std::size_t i = 0; i < device.size(); ++i)
      {
        CUDA_ERROR(cudaFreeHost(host_pinned[i]));
        CUDA_ERROR(cudaFree(device[i]));
      }
    }

    ns = nstreams;
    size = size_;
    std::size_t sizeof_ = size * sizeof(T);

    host_pinned.resize(nstreams);
    device.resize(nstreams);

    for (std::size_t i = 0; i < nstreams; ++i)
    {
      CUDA_ERROR(cudaMallocHost((void**)&host_pinned[i], sizeof_));
      CUDA_ERROR(cudaMalloc((void**)&device[i], sizeof_));
    }
  }
}
/*! \brief Destroy distributed L & U matrices. */
void
Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
{
    int_t i, nb, nsupers;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;

#if ( DEBUGlevel>=1 )
    int iam;
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    CHECK_MALLOC(iam, "Enter Destroy_LU()");
#endif

    nsupers = Glu_persist->supno[n-1] + 1;

    nb = CEILING(nsupers, grid->npcol);
    for (i = 0; i < nb; ++i)
        if ( Llu->Lrowind_bc_ptr[i] ) {
            SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
#ifdef GPU_ACC
            checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
#else
            SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
#endif
        }
    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
    SUPERLU_FREE (Llu->Lnzval_bc_ptr);

    nb = CEILING(nsupers, grid->nprow);
    for (i = 0; i < nb; ++i)
        if ( Llu->Ufstnz_br_ptr[i] ) {
            SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
            SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
        }
    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
    SUPERLU_FREE (Llu->Unzval_br_ptr);

    /* The following can be freed after factorization. */
    SUPERLU_FREE(Llu->ToRecv);
    SUPERLU_FREE(Llu->ToSendD);
    SUPERLU_FREE(Llu->ToSendR[0]);
    SUPERLU_FREE(Llu->ToSendR);

    /* The following can be freed only after iterative refinement. */
    SUPERLU_FREE(Llu->ilsum);
    SUPERLU_FREE(Llu->fmod);
    SUPERLU_FREE(Llu->fsendx_plist[0]);
    SUPERLU_FREE(Llu->fsendx_plist);
    SUPERLU_FREE(Llu->bmod);
    SUPERLU_FREE(Llu->bsendx_plist[0]);
    SUPERLU_FREE(Llu->bsendx_plist);
    SUPERLU_FREE(Llu->mod_bit);

    SUPERLU_FREE(Glu_persist->xsup);
    SUPERLU_FREE(Glu_persist->supno);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit Destroy_LU()");
#endif
}
pinned_mem_pool::~pinned_mem_pool()
{
    if (mem_) {
        cudaFreeHost(mem_);
        mem_ = NULL;
    }
}
void pinned_mem_pool::destroy()
{
    if (mem_) {
        cudaFreeHost(mem_);
        mem_ = NULL;
    }
}
inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifndef CPU_ONLY
  if (use_cuda) {
    CUDA_CHECK(cudaFreeHost(ptr));
    return;
  }
#endif
#ifdef USE_MLSL
  if (mn::is_multinode()) {
    mn::free(ptr);
  } else {
#endif /* !USE_MLSL */
#ifdef USE_MKL
    mkl_free(ptr);
#else
    free(ptr);
#endif
#ifdef USE_MLSL
  }
#endif /* USE_MLSL */
}
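// A minimal sketch of the allocation path that pairs with CaffeFreeHost above. The helper
// name, the MKL alignment, and the omission of the MLSL branch are assumptions made for
// illustration; this is not a quote of the upstream Caffe header.
inline void CaffeMallocHostSketch(void** ptr, size_t size, bool* use_cuda) {
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaMallocHost(ptr, size));  // pinned allocation, released with cudaFreeHost
    *use_cuda = true;
    return;
  }
#endif
#ifdef USE_MKL
  *ptr = mkl_malloc(size, 64);              // released with mkl_free
#else
  *ptr = malloc(size);                      // released with free
#endif
  *use_cuda = false;
}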
TEST(HostAlloc, MappedPointer) {
    cudaError_t ret;
    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    void * ptr;
    ret = cudaHostAlloc(&ptr, 4, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    /*
     * Try to retrieve the device pointer, expecting a result according to
     * prop.canMapHostMemory.
     */
    void * device_ptr;
    ret = cudaHostGetDevicePointer(&device_ptr, ptr, 0);
    if (prop.canMapHostMemory) {
        EXPECT_EQ(cudaSuccess, ret);
        EXPECT_FALSE(device_ptr == NULL);
    } else {
        EXPECT_EQ(cudaErrorMemoryAllocation, ret);
    }

    ret = cudaFreeHost(ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
/** Documented at declaration */
int
gpujpeg_image_destroy(uint8_t* image)
{
    cudaFreeHost(image);
    return 0;
}
bool DumpIntegerMemoryDataSet(char *name, unsigned int* device_values, int nb_data)
{
    //
    // Allocate the host memory area used to retrieve the data coming from the GPU
    //
    printf("(II) DumpIntegerMemoryDataSet(%s, %p, %d)\n", name, device_values, nb_data);

    cudaError_t Status;
    unsigned int* host_values;
    CUDA_MALLOC_HOST(&host_values, nb_data, __FILE__, __LINE__);

    Status = cudaMemcpy(host_values, device_values, nb_data * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    if (Status != cudaSuccess) {
        printf("\n(EE) cudaMemcpy: %s\n", cudaGetErrorString(Status));
    }

    PrintIntegerMatrix(name, host_values, nb_data, 8);
    // PrintIntegerMatrix(name, host_values, nb_data);

    Status = cudaFreeHost(host_values);
    if (Status != cudaSuccess) {
        printf("\n(EE) cudaFreeHost: %s\n", cudaGetErrorString(Status));
    }
    return true;
}
void storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd, int bytes, int V, int pad)
{
  Float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);

  cudaMemcpy(packedEven, even, bytes/2, cudaMemcpyDeviceToHost);
  cudaMemcpy(packedOdd, odd, bytes/2, cudaMemcpyDeviceToHost);

  unpackMomField((Float*)mom, packedEven, 0, V/2, pad);
  unpackMomField((Float*)mom, packedOdd, 1, V/2, pad);

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
static void storeGaugeField(Float *cpuGauge, FloatN *gauge, GaugeFieldOrder cpu_order,
                            QudaReconstructType reconstruct, int bytes, int volumeCB, int pad)
{
  // Use pinned memory
  FloatN *packed;
  cudaMallocHost(&packed, bytes);
  cudaMemcpy(packed, gauge, bytes, cudaMemcpyDeviceToHost);

  FloatN *packedEven = packed;
  FloatN *packedOdd = (FloatN*)((char*)packed + bytes/2);

  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
    unpackMILCGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackMILCGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else {
    errorQuda("Invalid gauge_order");
  }

  cudaFreeHost(packed);
}
void cudaCloverField::loadParityField(void *clover, void *cloverNorm, const void *h_clover,
                                      const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order)
{
  // use pinned memory
  void *packedClover, *packedCloverNorm;

  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (cpu_order != QUDA_PACKED_CLOVER_ORDER && cpu_order != QUDA_BQCD_CLOVER_ORDER)
    errorQuda("Invalid clover order %d", cpu_order);

  if (cudaMallocHost(&packedClover, bytes/2) == cudaErrorMemoryAllocation)
    errorQuda("Error allocating clover pinned memory");
  if (precision == QUDA_HALF_PRECISION) {
    if (cudaMallocHost(&packedCloverNorm, norm_bytes/2) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating clover pinned memory");
    }
  }

  if (precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
    } else {
      packParityClover((float4 *)packedClover, (float *)h_clover, volumeCB, pad, cpu_order);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (double *)h_clover, volumeCB, pad, cpu_order);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (float *)h_clover, volumeCB, pad, cpu_order);
    }
  }

  cudaMemcpy(clover, packedClover, bytes/2, cudaMemcpyHostToDevice);
  if (precision == QUDA_HALF_PRECISION)
    cudaMemcpy(cloverNorm, packedCloverNorm, norm_bytes/2, cudaMemcpyHostToDevice);

  cudaFreeHost(packedClover);
  if (precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
}
Mesh::~Mesh()
{
    cudaFreeHost(coords_pinned);
    cudaFreeHost(ENList_pinned);
    cudaFreeHost(metric_pinned);
    cudaFreeHost(normals_pinned);
    cudaFreeHost(NNListArray_pinned);
    cudaFreeHost(NNListIndex_pinned);
    cudaFreeHost(NEListArray_pinned);
    cudaFreeHost(NEListIndex_pinned);
}
void aligned_free(void *ptr, const size_t sz)
{
#if 1
  //munlock(ptr, sz);
  free(ptr);
#else
  cudaFreeHost(ptr);
#endif
}
void cuda_hostfree(void* ptr)
{
  struct cuda_mem_s* nptr = search(ptr, true);
  assert(nptr->ptr == ptr);
  assert(!nptr->device);
  free(nptr);
  cudaFreeHost(ptr);
}
TEST(MemcpyAsync, Pinned) {
    /**
     * Host memory must be pinned in order to be used as an argument to
     * cudaMemcpyAsync.  Panoptes only prints a warning about this error
     * rather than actually returning an error via the CUDA API.  This test
     * is written so as to check for the absence of an error once the CUDA
     * implementation starts returning one for nonpinned host memory.
     */
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    const size_t pages = 3;
    assert(pages > 0);

    cudaError_t ret;
    cudaStream_t stream;
    uint8_t *device_ptr, *host_ptr;

    ret = cudaMalloc((void **) &device_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocHost((void **) &host_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Page aligned transfers */
    for (size_t i = 0; i < pages; i++) {
        for (size_t j = i; j < pages; j++) {
            ret = cudaMemcpyAsync(device_ptr, host_ptr + i * page_size,
                                  (pages - j) * page_size,
                                  cudaMemcpyHostToDevice, stream);
            EXPECT_EQ(cudaSuccess, ret);

            ret = cudaMemcpyAsync(host_ptr + i * page_size, device_ptr,
                                  (pages - j) * page_size,
                                  cudaMemcpyDeviceToHost, stream);
            EXPECT_EQ(cudaSuccess, ret);
        }
    }

    /* Try a nonaligned transfer. */
    ret = cudaMemcpyAsync(device_ptr, host_ptr + (page_size / 2),
                          page_size / 2, cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
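// A minimal companion sketch (not from the original suite; the test name is an assumption)
// showing the other way to obtain pinned memory for cudaMemcpyAsync: registering an
// existing host allocation with cudaHostRegister / cudaHostUnregister instead of using
// cudaMallocHost / cudaFreeHost. It assumes the same gtest and CUDA runtime setup as the
// tests above.
TEST(MemcpyAsync, RegisteredHostMemory) {
    const size_t bytes = 1 << 20;
    cudaError_t ret;
    void *device_ptr;
    void *host_ptr = malloc(bytes);
    ASSERT_TRUE(host_ptr != NULL);

    ret = cudaMalloc(&device_ptr, bytes);
    ASSERT_EQ(cudaSuccess, ret);

    /* Pin the existing pages so the asynchronous copy can use DMA. */
    ret = cudaHostRegister(host_ptr, bytes, cudaHostRegisterDefault);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(device_ptr, host_ptr, bytes, cudaMemcpyHostToDevice, stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Unregister rather than cudaFreeHost, because the memory came from malloc. */
    ret = cudaHostUnregister(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
    free(host_ptr);
}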
void DestroySites()
{
    glDeleteBuffersARB(1, &vboId);
    glDeleteBuffersARB(1, &colorboId);

    delete[] site_list_x;
    delete[] site_list_x_bar;
    cudaFreeHost(site_list);
}
inline void deallocate(void* ptr, size_t)
{
  cudaError_t error = cudaFreeHost(ptr);

  if (error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(),
                               "pinned_resource::deallocate(): cudaFreeHost");
  }
}
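// A minimal sketch of the allocate() counterpart to the deallocate() above, written in the
// same thrust-based pinned-resource style; the surrounding class layout is an assumption.
inline void* allocate(size_t bytes)
{
  void* ptr = NULL;
  cudaError_t error = cudaMallocHost(&ptr, bytes);  // pinned allocation, paired with cudaFreeHost

  if (error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(),
                               "pinned_resource::allocate(): cudaMallocHost");
  }
  return ptr;
}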