Code Example #1
/** Documented at declaration */
int
gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
{
    if ( coder->data_raw != NULL )
        cudaFreeHost(coder->data_raw);
    if ( coder->d_data_raw != NULL )
        cudaFree(coder->d_data_raw);
    if ( coder->d_data != NULL )
        cudaFree(coder->d_data);
    if ( coder->data_quantized != NULL )
        cudaFreeHost(coder->data_quantized);
    if ( coder->d_data_quantized != NULL )
        cudaFree(coder->d_data_quantized);
    if ( coder->data_compressed != NULL )
        cudaFreeHost(coder->data_compressed);
    if ( coder->d_data_compressed != NULL )
        cudaFree(coder->d_data_compressed);
    if ( coder->segment != NULL )
        cudaFreeHost(coder->segment);
    if ( coder->d_segment != NULL )
        cudaFree(coder->d_segment);
    if ( coder->d_temp_huffman != NULL )
        cudaFree(coder->d_temp_huffman);
    if ( coder->block_list != NULL )
        cudaFreeHost(coder->block_list);
    if ( coder->d_block_list != NULL )
        cudaFree(coder->d_block_list);
    return 0;
}
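Note on the guards above: cudaFree(NULL) and cudaFreeHost(NULL) are documented no-ops that return cudaSuccess, so the NULL checks are defensive rather than required. A minimal sketch of an unguarded variant that still surfaces failures (hypothetical helper, not part of GPUJPEG):

/* Sketch: free a pinned host buffer and a device buffer unconditionally,
 * relying on the documented behavior that freeing NULL is a no-op. */
static int
free_host_and_device(void* host_ptr, void* device_ptr)
{
    if ( cudaFreeHost(host_ptr) != cudaSuccess )
        return -1;
    if ( cudaFree(device_ptr) != cudaSuccess )
        return -1;
    return 0;
}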
Code Example #2
File: hisq_force_utils.cpp Project: kpetrov/quda
    static void 
      loadOprodFromCPUArrayQuda(void *cudaOprodEven, void *cudaOprodOdd, void *cpuOprod,
          size_t bytes, int Vh)
      {
        // Use pinned memory 
        float2 *packedEven, *packedOdd;
        checkCudaError();

        if (cudaMallocHost(&packedEven, bytes) == cudaErrorMemoryAllocation) {
          errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
        }
        if (cudaMallocHost(&packedOdd, bytes) == cudaErrorMemoryAllocation) {
          errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
        }
        checkCudaError();


        packOprodField(packedEven, (float*)cpuOprod, 0, Vh);
        packOprodField(packedOdd,  (float*)cpuOprod, 1, Vh);
        checkCudaError();


        cudaMemset(cudaOprodEven, 0, bytes);
        cudaMemset(cudaOprodOdd, 0, bytes);
        checkCudaError();

        cudaMemcpy(cudaOprodEven, packedEven, bytes, cudaMemcpyHostToDevice);
        checkCudaError();
        cudaMemcpy(cudaOprodOdd, packedOdd, bytes, cudaMemcpyHostToDevice);
        checkCudaError();

        cudaFreeHost(packedEven);
        cudaFreeHost(packedOdd);
      }
Code Example #3
File: hisq_force_utils.cpp Project: kpetrov/quda
    static void
      copyOprodFromCPUArrayQuda(FullOprod cudaOprod, void *cpuOprod,
          size_t bytes_per_dir, int Vh)
      {
        // Use pinned memory 
        float2 *packedEven, *packedOdd;
        if (cudaMallocHost(&packedEven, bytes_per_dir) == cudaErrorMemoryAllocation) {
          errorQuda("ERROR: cudaMallocHost failed for packedEven\n");
        }
        if (cudaMallocHost(&packedOdd, bytes_per_dir) == cudaErrorMemoryAllocation) {
          errorQuda("ERROR: cudaMallocHost failed for packedOdd\n");
        }

        for(int dir=0; dir<4; dir++){
          packOprodFieldDir(packedEven, (float*)cpuOprod, dir, 0, Vh);
          packOprodFieldDir(packedOdd,  (float*)cpuOprod, dir, 1, Vh);

          cudaMemset(cudaOprod.even.data[dir], 0, bytes_per_dir);
          cudaMemset(cudaOprod.odd.data[dir],  0, bytes_per_dir);
          checkCudaError();

          cudaMemcpy(cudaOprod.even.data[dir], packedEven, bytes_per_dir, cudaMemcpyHostToDevice);
          cudaMemcpy(cudaOprod.odd.data[dir], packedOdd, bytes_per_dir, cudaMemcpyHostToDevice);
          checkCudaError();
        }
        cudaFreeHost(packedEven);
        cudaFreeHost(packedOdd);
      }
Code Example #4
File: clover_quda.cpp Project: adenbley/quda
void loadParityClover(ParityClover ret, void *clover, QudaPrecision cpu_prec, 
		      CloverFieldOrder clover_order)
{
  // use pinned memory
  void *packedClover, *packedCloverNorm;

  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover_order");
  }

#ifndef __DEVICE_EMULATION__
  if (cudaMallocHost(&packedClover, ret.bytes) == cudaErrorMemoryAllocation) {
    errorQuda("Error allocating clover pinned memory");
  }  
  if (ret.precision == QUDA_HALF_PRECISION) 
    if (cudaMallocHost(&packedCloverNorm, ret.bytes/18) == cudaErrorMemoryAllocation) {
      errorQuda("Error allocating clover pinned memory");
    } 
#else
  packedClover = malloc(ret.bytes);
  if (ret.precision == QUDA_HALF_PRECISION) packedCloverNorm = malloc(ret.bytes/18);
#endif
    
  if (ret.precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)clover, ret.volume, ret.pad);
  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)clover, ret.volume, ret.pad);
    } else {
      packParityClover((float4 *)packedClover, (float *)clover, ret.volume, ret.pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, 
			   (double *)clover, ret.volume, ret.pad);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, 
			   (float *)clover, ret.volume, ret.pad);
    }
  }
  
  cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice);
  if (ret.precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18, cudaMemcpyHostToDevice);
  }

#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
#else
  free(packedClover);
  if (ret.precision == QUDA_HALF_PRECISION) free(packedCloverNorm);
#endif

}
Code Example #5
File: main.c Project: KKobuszewski/CudaGPE
void free_data_arr(DataArray* data_arr) {
    cudaFreeHost(*(data_arr->data_r));
    printf("host r space freed\n");
    cudaFreeHost(*(data_arr->data_k));
    printf("host k space freed\n");
//   cudaFree(*(data_arr->data_r_dev));
//   cudaDeviceSynchronize();
//   printf("device r space freed\n");
//   cudaFree(*(data_arr->data_k_dev));
//   cudaDeviceSynchronize();
//   printf("device k space freed\n");
}
Code Example #6
File: clover_field.cpp Project: fwinter/quda
void cudaCloverField::loadFullField(void *even, void *evenNorm, void *odd, void *oddNorm, 
				    const void *h_clover, const QudaPrecision cpu_prec, 
				    const CloverFieldOrder cpu_order)
{
  // use pinned memory                  
  void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;

  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (cpu_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover order");
  }

  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);
  if (precision == QUDA_HALF_PRECISION) {
    cudaMallocHost(&packedEvenNorm, norm_bytes/2);
    cudaMallocHost(&packedOddNorm, norm_bytes/2);
  }
    
  if (precision == QUDA_DOUBLE_PRECISION) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)h_clover, x, pad);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)h_clover, x, pad);
    } else {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)h_clover, x, pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (double *)h_clover, x, pad);
    } else {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (float *)h_clover, x, pad);
    }
  }

  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);
  if (precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(evenNorm, packedEvenNorm, norm_bytes/2, cudaMemcpyHostToDevice);
    cudaMemcpy(oddNorm, packedOddNorm, norm_bytes/2, cudaMemcpyHostToDevice);
  }

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (precision == QUDA_HALF_PRECISION) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
}
Code Example #7
void MFNHashTypePlainCUDA::freeThreadAndDeviceMemory() {
    trace_printf("MFNHashTypePlainCUDA::freeThreadAndDeviceMemory()\n");

    cudaError_t err;

    // Free all the memory, then look for errors.
    cudaFree((void *)this->DeviceHashlistAddress);
    cudaFreeHost((void *)this->HostSuccessAddress);

    delete[] this->HostSuccessReportedAddress;

    // Only cudaFree these if zero copy is NOT in use; under zero copy they
    // alias the mapped host allocations freed below.
    if (!this->useZeroCopy) {
        cudaFree((void *)this->DeviceSuccessAddress);
        cudaFree((void *)this->DeviceFoundPasswordsAddress);
    }
    
    cudaFreeHost((void *)this->HostFoundPasswordsAddress);

    cudaFreeHost((void*)this->HostStartPointAddress);
    cudaFree((void *)this->DeviceStartPointAddress);
    cudaFree((void *)this->DeviceStartPasswords32Address);

    // Only free the bitmap memory if it has been allocated.
    if (this->DeviceBitmap128mb_a_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_a_Address);
        this->DeviceBitmap128mb_a_Address = 0;
    }
    if (this->DeviceBitmap128mb_b_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_b_Address);
        this->DeviceBitmap128mb_b_Address = 0;
    }
    if (this->DeviceBitmap128mb_c_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_c_Address);
        this->DeviceBitmap128mb_c_Address = 0;
    }
    if (this->DeviceBitmap128mb_d_Address) {
        cudaFree((void *)this->DeviceBitmap128mb_d_Address);
        this->DeviceBitmap128mb_d_Address = 0;
    }

    // Get any error that occurred above and report it.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Thread %d: CUDA error freeing memory: %s. Exiting.\n",
                this->threadId, cudaGetErrorString( err));
        exit(1);
    }
}
Code Example #8
void GRTRegenerateChains::FreePerGPUMemory(GRTRegenerateThreadRunData *data) {
    CH_CUDA_SAFE_CALL(cudaFree(this->DEVICE_Hashes[data->threadID]));

    CH_CUDA_SAFE_CALL(cudaFreeHost(this->HOST_Success[data->threadID]));
    CH_CUDA_SAFE_CALL(cudaFreeHost(this->HOST_Passwords[data->threadID]));
    // Only free the device memory if zero copy was NOT used
    if (!this->CommandLineData->GetUseZeroCopy()) {
        CH_CUDA_SAFE_CALL(cudaFree(this->DEVICE_Passwords[data->threadID]));
        CH_CUDA_SAFE_CALL(cudaFree(this->DEVICE_Success[data->threadID]));
    }

    delete[] this->HOST_Success_Reported[data->threadID];
    //printf("Memory for thread %d freed.\n", data->threadID);
}
Code Example #9
OpenSteer::MemoryBackend::~MemoryBackend() {
    std::cout << "MemoryBackend reset" << std::endl;
    if (_data != 0) {
        cudaFreeHost(_data);
    }
    
    if (_const != 0) {
        cudaFreeHost(_const);
    }
    
    _data = 0;
    _const = 0;
    _instance = 0;
    _idCounter = 0;
}
Code Example #10
File: cuda_gauge_field.cpp Project: mchengcit/quda
void loadMomField(Float2 *even, Float2 *odd, Float *mom, int bytes, int Vh, int pad) 
{  
  Float2 *packedEven, *packedOdd;
  cudaMallocHost(&packedEven, bytes/2); 
  cudaMallocHost(&packedOdd, bytes/2); 
    
  packMomField(packedEven, (Float*)mom, 0, Vh, pad);
  packMomField(packedOdd,  (Float*)mom, 1, Vh, pad);
    
  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd,  packedOdd, bytes/2, cudaMemcpyHostToDevice); 
  
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
}
Code Example #11
    /**
     * destructor
     */
    virtual ~MappedBufferIntern()
    {
        __startOperation(ITask::TASK_CUDA);
        __startOperation(ITask::TASK_HOST);

        if (pointer && ownPointer)
        {
#if( PMACC_CUDA_ENABLED == 1 )
/* cupla 0.1.0 does not support the function cudaHostAlloc to create mapped memory.
 * Therefore we need to call the native CUDA function cudaFreeHost to free memory.
 * Due to the renaming of cuda functions with cupla via macros we need to remove
 * the renaming to get access to the native cuda function.
 * @todo this is a workaround please fix me. We need to investigate if
 * it is possible to have mapped/unified memory in alpaka.
 *
 * corresponding alpaka issues:
 *   https://github.com/ComputationalRadiationPhysics/alpaka/issues/296
 *   https://github.com/ComputationalRadiationPhysics/alpaka/issues/612
 */
#   undef cudaFreeHost
            CUDA_CHECK((cuplaError_t)cudaFreeHost(pointer));
// re-introduce the cupla macro
#   define cudaFreeHost(...) cuplaFreeHost(__VA_ARGS__)
#else
            __deleteArray(pointer);
#endif
        }
    }
Code Example #12
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
Code Example #13
File: HostBufferIntern.hpp Project: Heikman/picongpu
 /**
  * destructor
  */
 virtual ~HostBufferIntern() throw (std::runtime_error)
 {
     if (pointer && ownPointer)
     {
         CUDA_CHECK(cudaFreeHost(pointer));
     }
 }
Code Example #14
File: specific_cuda.hpp Project: psiha/nt2
      void allocate(std::size_t size_,std::size_t nstreams , nt2::host_ &)
      {
        if(size_ > size)
        {
          if(size != 0)
          {
            for (std::size_t i = 0; i < device.size(); ++i)
            {
              CUDA_ERROR(cudaFreeHost(host_pinned[i]));
              CUDA_ERROR(cudaFree(device[i]));
            }
          }
          ns = nstreams;
          size = size_;
          std::size_t sizeof_ = size*sizeof(T);
          host_pinned.resize(nstreams);
          device.resize(nstreams);
          for (std::size_t i = 0; i < nstreams; ++i)
          {
            CUDA_ERROR(cudaMallocHost((void**)&host_pinned[i], sizeof_));
            CUDA_ERROR(cudaMalloc((void**)&device[i], sizeof_));
          }
        }
      }
Code Example #15
File: util.c Project: DBorello/OpenSeesDev
/*! \brief Destroy distributed L & U matrices. */
void
Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
{
    int_t i, nb, nsupers;
    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
    LocalLU_t *Llu = LUstruct->Llu;

#if ( DEBUGlevel>=1 )
    int iam;
    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
    CHECK_MALLOC(iam, "Enter Destroy_LU()");
#endif

    nsupers = Glu_persist->supno[n-1] + 1;

    nb = CEILING(nsupers, grid->npcol);
    for (i = 0; i < nb; ++i) 
	if ( Llu->Lrowind_bc_ptr[i] ) {
	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
#ifdef GPU_ACC
	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
#else
	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
#endif
	}
    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
    SUPERLU_FREE (Llu->Lnzval_bc_ptr);

    nb = CEILING(nsupers, grid->nprow);
    for (i = 0; i < nb; ++i)
	if ( Llu->Ufstnz_br_ptr[i] ) {
	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
	}
    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
    SUPERLU_FREE (Llu->Unzval_br_ptr);

    /* The following can be freed after factorization. */
    SUPERLU_FREE(Llu->ToRecv);
    SUPERLU_FREE(Llu->ToSendD);
    SUPERLU_FREE(Llu->ToSendR[0]);
    SUPERLU_FREE(Llu->ToSendR);

    /* The following can be freed only after iterative refinement. */
    SUPERLU_FREE(Llu->ilsum);
    SUPERLU_FREE(Llu->fmod);
    SUPERLU_FREE(Llu->fsendx_plist[0]);
    SUPERLU_FREE(Llu->fsendx_plist);
    SUPERLU_FREE(Llu->bmod);
    SUPERLU_FREE(Llu->bsendx_plist[0]);
    SUPERLU_FREE(Llu->bsendx_plist);
    SUPERLU_FREE(Llu->mod_bit);

    SUPERLU_FREE(Glu_persist->xsup);
    SUPERLU_FREE(Glu_persist->supno);

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit Destroy_LU()");
#endif
}
Code Example #16
File: pinned_mem_pool.cpp Project: PickXu/pantry
pinned_mem_pool::~pinned_mem_pool()
{
        if (mem_) {
                cudaFreeHost(mem_);
                mem_ = NULL;
        }
}
Code Example #17
File: pinned_mem_pool.cpp Project: PickXu/pantry
void pinned_mem_pool::destroy()
{
	if (mem_) {
		cudaFreeHost(mem_);
		mem_ = NULL;
	}
}
Code Example #18
File: syncedmem.hpp Project: csuhawk/caffe
inline void CaffeFreeHost(void* ptr, bool use_cuda) {
#ifndef CPU_ONLY
  if (use_cuda) {
    CUDA_CHECK(cudaFreeHost(ptr));
    return;
  }
#endif

#ifdef USE_MLSL
  if (mn::is_multinode()) {
    mn::free(ptr);
  } else {
#endif /* !USE_MLSL */

#ifdef USE_MKL
    mkl_free(ptr);
#else
    free(ptr);
#endif

#ifdef USE_MLSL
  }
#endif /* USE_MLSL */

}
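For symmetry, Caffe pairs this with a host allocator that records whether CUDA pinned memory was used, so the flag later passed to CaffeFreeHost selects the matching release path. A minimal sketch of the plain (non-MLSL, non-MKL) variant, written from memory and best treated as an approximation rather than the exact upstream code:

inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) {
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaMallocHost(ptr, size));  // pinned allocation
    *use_cuda = true;
    return;
  }
#endif
  *ptr = malloc(size);  // pageable fallback, released with free() above
  *use_cuda = false;
}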
Code Example #19
TEST(HostAlloc, MappedPointer) {
    cudaError_t ret;
    int device;

    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    void * ptr;
    ret = cudaHostAlloc(&ptr, 4, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    /*
     * Try to retrieve the device pointer, expecting a result according to
     * prop.canMapHostMemory.
     */
    void * device_ptr;
    ret = cudaHostGetDevicePointer(&device_ptr, ptr, 0);
    if (prop.canMapHostMemory) {
        EXPECT_EQ(cudaSuccess, ret);
        EXPECT_FALSE(device_ptr == NULL);
    } else {
        EXPECT_EQ(cudaErrorMemoryAllocation, ret);
    }

    ret = cudaFreeHost(ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
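Once cudaHostGetDevicePointer succeeds (i.e. on a device where prop.canMapHostMemory is set), the returned address is dereferenceable from device code, and writes through it land directly in the mapped host allocation. A minimal zero-copy round trip might look like the following sketch (hypothetical kernel, not part of the test above):

__global__ void write_flag(int* p) { *p = 42; }

// Usage sketch, reusing ptr/device_ptr from the test body:
//   write_flag<<<1, 1>>>(static_cast<int*>(device_ptr));
//   cudaDeviceSynchronize();                    // make the write visible
//   assert(*static_cast<int*>(ptr) == 42);      // host reads the mapped memory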
Code Example #20
/** Documented at declaration */
int
gpujpeg_image_destroy(uint8_t* image)
{
    cudaFreeHost(image);

    return 0;
}
Code Example #21
bool DumpIntegerMemoryDataSet(char *name, unsigned int* device_values, int nb_data ){

    //
    // Allocate the host memory area used to retrieve the data coming from the GPU
    //
    printf("(II) DumpIntegerMemoryDataSet(%s, %p, %d)\n", name, device_values, nb_data);
    cudaError_t Status;
    unsigned int* host_values;
    CUDA_MALLOC_HOST(&host_values, nb_data, __FILE__, __LINE__);

    Status = cudaMemcpy(host_values, device_values, nb_data * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    if(Status != cudaSuccess)
    {
        printf("(EE) cudaMemcpy failed: %s\n", cudaGetErrorString(Status));
    }

    PrintIntegerMatrix(name, host_values, nb_data, 8);
    // PrintIntegerMatrix(name, host_values, nb_data);
    Status = cudaFreeHost(host_values);
    if(Status != cudaSuccess)
    {
        printf("(EE) cudaFreeHost failed: %s\n", cudaGetErrorString(Status));
    }
    return true;
}
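CUDA_MALLOC_HOST is a project-specific wrapper rather than a CUDA API. A plausible definition, sketched under the assumption that it allocates nb_data elements of the pointee type and aborts on failure (the real macro may differ):

/* Hypothetical checked wrapper around cudaMallocHost. */
#define CUDA_MALLOC_HOST(pptr, n, file, line)                                    \
    do {                                                                         \
        cudaError_t e_ = cudaMallocHost((void**)(pptr), (n) * sizeof(**(pptr))); \
        if (e_ != cudaSuccess) {                                                 \
            printf("(EE) %s:%d cudaMallocHost failed: %s\n",                     \
                   (file), (line), cudaGetErrorString(e_));                      \
            exit(EXIT_FAILURE);                                                  \
        }                                                                        \
    } while (0)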
Code Example #22
File: cuda_gauge_field.cpp Project: mchengcit/quda
void 
storeMomToCPUArray(Float* mom, Float2 *even, Float2 *odd, 
		   int bytes, int V, int pad) 
{    
  Float2 *packedEven, *packedOdd;   
  cudaMallocHost(&packedEven, bytes/2); 
  cudaMallocHost(&packedOdd, bytes/2); 
  cudaMemcpy(packedEven, even, bytes/2, cudaMemcpyDeviceToHost); 
  cudaMemcpy(packedOdd, odd, bytes/2, cudaMemcpyDeviceToHost);  
  
  unpackMomField((Float*)mom, packedEven,0, V/2, pad);
  unpackMomField((Float*)mom, packedOdd, 1, V/2, pad);
  
  cudaFreeHost(packedEven); 
  cudaFreeHost(packedOdd); 
}
Code Example #23
File: cuda_gauge_field.cpp Project: mchengcit/quda
static void storeGaugeField(Float *cpuGauge, FloatN *gauge, GaugeFieldOrder cpu_order,
			    QudaReconstructType reconstruct, int bytes, int volumeCB, int pad) {

  // Use pinned memory
  FloatN *packed;
  cudaMallocHost(&packed, bytes);
  cudaMemcpy(packed, gauge, bytes, cudaMemcpyDeviceToHost);
    
  FloatN *packedEven = packed;
  FloatN *packedOdd = (FloatN*)((char*)packed + bytes/2);
    
  if (cpu_order == QUDA_QDP_GAUGE_ORDER) {
    unpackQDPGaugeField((Float**)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackQDPGaugeField((Float**)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_CPS_WILSON_GAUGE_ORDER) {
    unpackCPSGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackCPSGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else if (cpu_order == QUDA_MILC_GAUGE_ORDER) {
    unpackMILCGaugeField((Float*)cpuGauge, packedEven, 0, reconstruct, volumeCB, pad);
    unpackMILCGaugeField((Float*)cpuGauge, packedOdd, 1, reconstruct, volumeCB, pad);
  } else {
    errorQuda("Invalid gauge_order");
  }
    
  cudaFreeHost(packed);
}
Code Example #24
File: clover_field.cpp Project: urbach/quda
  void cudaCloverField::loadParityField(void *clover, void *cloverNorm, const void *h_clover, 
					const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order)
  {
    // use pinned memory                                                                                           
    void *packedClover, *packedCloverNorm;

    if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
      errorQuda("Cannot have CUDA double precision without CPU double precision");
    }
    if (cpu_order != QUDA_PACKED_CLOVER_ORDER && cpu_order != QUDA_BQCD_CLOVER_ORDER) 
      errorQuda("Invalid clover order %d", cpu_order);

    if (cudaMallocHost(&packedClover, bytes/2) == cudaErrorMemoryAllocation)
      errorQuda("Error allocating clover pinned memory");

    if (precision == QUDA_HALF_PRECISION) {
      if (cudaMallocHost(&packedCloverNorm, norm_bytes/2) == cudaErrorMemoryAllocation)
	{
	  errorQuda("Error allocating clover pinned memory");
	} 
    }
    
    if (precision == QUDA_DOUBLE_PRECISION) {
      packParityClover((double2 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
    } else if (precision == QUDA_SINGLE_PRECISION) {
      if (cpu_prec == QUDA_DOUBLE_PRECISION) {
	packParityClover((float4 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
      } else {
	packParityClover((float4 *)packedClover, (float *)h_clover, volumeCB, pad, cpu_order);
      }
    } else {
      if (cpu_prec == QUDA_DOUBLE_PRECISION) {
	packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, 
			     (double *)h_clover, volumeCB, pad, cpu_order);
      } else {
	packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm, 
			     (float *)h_clover, volumeCB, pad, cpu_order);
      }
    }
  
    cudaMemcpy(clover, packedClover, bytes/2, cudaMemcpyHostToDevice);
    if (precision == QUDA_HALF_PRECISION)
      cudaMemcpy(cloverNorm, packedCloverNorm, norm_bytes/2, cudaMemcpyHostToDevice);

    cudaFreeHost(packedClover);
    if (precision == QUDA_HALF_PRECISION) cudaFreeHost(packedCloverNorm);
  }
Code Example #25
File: Mesh.cpp Project: cogwirrel/smooth
Mesh::~Mesh() {
  cudaFreeHost(coords_pinned);
  cudaFreeHost(ENList_pinned);
  cudaFreeHost(metric_pinned);
  cudaFreeHost(normals_pinned);
  cudaFreeHost(NNListArray_pinned);
  cudaFreeHost(NNListIndex_pinned);
  cudaFreeHost(NEListArray_pinned);
  cudaFreeHost(NEListIndex_pinned);
}
Code Example #26
File: Utils.cpp Project: ashwinma/multicl
void aligned_free(void *ptr, const size_t sz)
{
#if 1
	//munlock(ptr, sz);
	free(ptr);
#else
	cudaFreeHost(ptr);
#endif
}
Code Example #27
File: gpuops.c Project: frankong/bart
void cuda_hostfree(void* ptr)
{
	struct cuda_mem_s* nptr = search(ptr, true);
	assert(nptr->ptr == ptr);
	assert(!nptr->device);
	free(nptr);

	cudaFreeHost(ptr);
}
Code Example #28
TEST(MemcpyAsync, Pinned) {
    /**
     * Host memory must be pinned in order to be used as an argument to
     * cudaMemcpyAsync.  Panoptes only prints a warning about this error
     * rather than actually returning an error via the CUDA API.  This test
     * is written so as to check for the absence of an error once the CUDA
     * implementation starts returning one for non-pinned host memory.
     */
    const long page_size_ = sysconf(_SC_PAGESIZE);
    ASSERT_LT(0, page_size_);
    const size_t page_size = page_size_;

    const size_t pages = 3;
    assert(pages > 0);

    cudaError_t ret;
    cudaStream_t stream;

    uint8_t *device_ptr, *host_ptr;
    ret = cudaMalloc((void **) &device_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMallocHost((void **) &host_ptr, pages * page_size);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /* Page aligned transfers */
    for (size_t i = 0; i < pages; i++) {
        for (size_t j = i; j < pages; j++) {
            ret = cudaMemcpyAsync(device_ptr, host_ptr + i * page_size,
                (pages - j) * page_size, cudaMemcpyHostToDevice, stream);
            EXPECT_EQ(cudaSuccess, ret);

            ret = cudaMemcpyAsync(host_ptr + i * page_size, device_ptr,
                (pages - j) * page_size, cudaMemcpyDeviceToHost, stream);
            EXPECT_EQ(cudaSuccess, ret);
        }
    }

    /* Try a nonaligned transfer. */
    ret = cudaMemcpyAsync(device_ptr, host_ptr + (page_size / 2),
        page_size / 2, cudaMemcpyHostToDevice, stream);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(host_ptr);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(device_ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
Code Example #29
File: voronoi.cpp Project: painnick/lbfgsb-on-gpu
void DestroySites()
{
	glDeleteBuffersARB(1, &vboId);
	glDeleteBuffersARB(1, &colorboId);

	delete[] site_list_x;
	delete[] site_list_x_bar;
	cudaFreeHost(site_list);
}
Code Example #30
   inline void deallocate(void* ptr, size_t)
   {
     cudaError_t error = cudaFreeHost(ptr);
 
     if(error != cudaSuccess)
     {
       throw thrust::system_error(error, thrust::cuda_category(), "pinned_resource::deallocate(): cudaFreeHost");
     }
   }
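The matching acquisition side of this resource would obtain pinned memory with cudaMallocHost and convert failures into the same exception type. A minimal sketch of that counterpart (assumed interface, not taken from the original source):

   inline void* allocate(size_t bytes)
   {
     void* ptr = 0;
     cudaError_t error = cudaMallocHost(&ptr, bytes);

     if(error != cudaSuccess)
     {
       throw thrust::system_error(error, thrust::cuda_category(), "pinned_resource::allocate(): cudaMallocHost");
     }

     return ptr;
   }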