/**
 * Pack one parity of a host clover field and upload it to the device.
 *
 * The host data is first repacked into a temporary host staging buffer
 * (pinned memory unless __DEVICE_EMULATION__ is defined, in which case plain
 * malloc is used), then copied to ret.clover. For half precision a second
 * staging buffer holding the norm array is packed and copied to
 * ret.cloverNorm.
 *
 * @param ret          Destination descriptor; its precision, bytes, volume,
 *                     pad, clover and cloverNorm members are read here.
 * @param clover       Host clover data, interpreted as double or float
 *                     according to cpu_prec.
 * @param cpu_prec     Precision of the host data.
 * @param clover_order Host data ordering; only QUDA_PACKED_CLOVER_ORDER is
 *                     accepted.
 *
 * Every failure is reported through errorQuda().
 */
void loadParityClover(ParityClover ret, void *clover, QudaPrecision cpu_prec,
                      CloverFieldOrder clover_order)
{
  // Staging buffers; initialised so that an unused norm pointer is never
  // left indeterminate.
  void *packedClover = 0;
  void *packedCloverNorm = 0;

  const bool halfPrecision = (ret.precision == QUDA_HALF_PRECISION);

  if (ret.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (clover_order != QUDA_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover_order");
  }

  // use pinned memory
#ifndef __DEVICE_EMULATION__
  // Compare against cudaSuccess: cudaMallocHost can fail with codes other
  // than cudaErrorMemoryAllocation, and those must not be silently ignored.
  if (cudaMallocHost(&packedClover, ret.bytes) != cudaSuccess) {
    errorQuda("Error allocating clover pinned memory");
  }
  if (halfPrecision) {
    // Norm staging size (ret.bytes/18) is kept exactly as in the original code.
    if (cudaMallocHost(&packedCloverNorm, ret.bytes/18) != cudaSuccess) {
      errorQuda("Error allocating clover pinned memory");
    }
  }
#else
  packedClover = malloc(ret.bytes);
  if (packedClover == 0) {
    errorQuda("Error allocating clover host memory");
  }
  if (halfPrecision) {
    packedCloverNorm = malloc(ret.bytes/18);
    if (packedCloverNorm == 0) {
      errorQuda("Error allocating clover host memory");
    }
  }
#endif

  // Repack the host field into the staging buffer, selecting the packer from
  // the (device precision, host precision) pair.
  if (ret.precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)clover, ret.volume, ret.pad);
  } else if (ret.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)clover, ret.volume, ret.pad);
    } else {
      packParityClover((float4 *)packedClover, (float *)clover, ret.volume, ret.pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (double *)clover, ret.volume, ret.pad);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (float *)clover, ret.volume, ret.pad);
    }
  }

  // Upload; a failed copy would otherwise leave the device field stale.
  if (cudaMemcpy(ret.clover, packedClover, ret.bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
    errorQuda("Error copying clover field to device");
  }
  if (halfPrecision) {
    if (cudaMemcpy(ret.cloverNorm, packedCloverNorm, ret.bytes/18,
                   cudaMemcpyHostToDevice) != cudaSuccess) {
      errorQuda("Error copying clover norm field to device");
    }
  }

  // Release the staging buffers with the deallocator matching their allocator.
#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedClover);
  if (halfPrecision) cudaFreeHost(packedCloverNorm);
#else
  free(packedClover);
  if (halfPrecision) free(packedCloverNorm);
#endif
}
/**
 * Pack one parity of a host clover field and upload it to the device.
 *
 * Temporary pinned host buffers are allocated, the host data is repacked
 * into them, the result is copied to the device, and the buffers are freed.
 * Sizes used are bytes/2 for the clover data and norm_bytes/2 for the norm
 * array (half precision only).
 *
 * @param clover     Device destination for the packed clover data.
 * @param cloverNorm Device destination for the norm array; only written when
 *                   the field precision is QUDA_HALF_PRECISION.
 * @param h_clover   Host source data, interpreted as double or float
 *                   according to cpu_prec.
 * @param cpu_prec   Precision of the host data.
 * @param cpu_order  Host ordering; QUDA_PACKED_CLOVER_ORDER and
 *                   QUDA_BQCD_CLOVER_ORDER are accepted.
 *
 * Every failure is reported through errorQuda().
 */
void cudaCloverField::loadParityField(void *clover, void *cloverNorm, const void *h_clover,
                                      const QudaPrecision cpu_prec,
                                      const CloverFieldOrder cpu_order)
{
  // use pinned memory
  // Both pointers are initialised so the norm pointer is never indeterminate
  // when the field is not half precision.
  void *packedClover = 0;
  void *packedCloverNorm = 0;

  const bool halfPrecision = (precision == QUDA_HALF_PRECISION);

  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (cpu_order != QUDA_PACKED_CLOVER_ORDER && cpu_order != QUDA_BQCD_CLOVER_ORDER) {
    errorQuda("Invalid clover order %d", cpu_order);
  }

  // Compare against cudaSuccess: cudaMallocHost can fail with codes other
  // than cudaErrorMemoryAllocation, and those must not be silently ignored.
  cudaError_t err = cudaMallocHost(&packedClover, bytes/2);
  if (err != cudaSuccess) {
    errorQuda("Error allocating clover pinned memory (%s)", cudaGetErrorString(err));
  }
  if (halfPrecision) {
    err = cudaMallocHost(&packedCloverNorm, norm_bytes/2);
    if (err != cudaSuccess) {
      errorQuda("Error allocating clover pinned memory (%s)", cudaGetErrorString(err));
    }
  }

  // Repack the host field into the staging buffer, selecting the packer from
  // the (device precision, host precision) pair.
  if (precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
    } else {
      packParityClover((float4 *)packedClover, (float *)h_clover, volumeCB, pad, cpu_order);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (double *)h_clover, volumeCB, pad, cpu_order);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (float *)h_clover, volumeCB, pad, cpu_order);
    }
  }

  // Upload; a failed copy would otherwise leave the device field stale.
  err = cudaMemcpy(clover, packedClover, bytes/2, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    errorQuda("Error copying clover field to device (%s)", cudaGetErrorString(err));
  }
  if (halfPrecision) {
    err = cudaMemcpy(cloverNorm, packedCloverNorm, norm_bytes/2, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
      errorQuda("Error copying clover norm field to device (%s)", cudaGetErrorString(err));
    }
  }

  // Release the temporary pinned buffers.
  cudaFreeHost(packedClover);
  if (halfPrecision) cudaFreeHost(packedCloverNorm);
}
/**
 * Pack one parity of a host clover field and upload it to the device.
 *
 * The staging area is the bufferPinned member, sized by
 * resizeBuffer(bytes/2 + norm_bytes/2): the packed clover data occupies the
 * first bytes/2 bytes and, for half precision, the norm array follows at
 * offset bytes/2. Nothing is allocated or freed in this function itself.
 *
 * @param clover     Device destination for the packed clover data.
 * @param cloverNorm Device destination for the norm array; only written when
 *                   the field precision is QUDA_HALF_PRECISION.
 * @param h_clover   Host source data, interpreted as double or float
 *                   according to cpu_prec.
 * @param cpu_prec   Precision of the host data.
 * @param cpu_order  Host ordering; QUDA_PACKED_CLOVER_ORDER and
 *                   QUDA_BQCD_CLOVER_ORDER are accepted.
 *
 * Every failure is reported through errorQuda().
 */
void cudaCloverField::loadParityField(void *clover, void *cloverNorm, const void *h_clover,
                                      const QudaPrecision cpu_prec,
                                      const CloverFieldOrder cpu_order)
{
  // use pinned memory
  void *packedClover = 0;
  void *packedCloverNorm = 0;

  const bool halfPrecision = (precision == QUDA_HALF_PRECISION);

  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (cpu_order != QUDA_PACKED_CLOVER_ORDER && cpu_order != QUDA_BQCD_CLOVER_ORDER) {
    errorQuda("Invalid clover order %d", cpu_order);
  }

  // One buffer holds both regions: clover data first, norm array after it.
  resizeBuffer(bytes/2 + norm_bytes/2);
  packedClover = bufferPinned;
  if (halfPrecision) {
    packedCloverNorm = (char*)bufferPinned + bytes/2;
  }

  // Repack the host field into the staging buffer, selecting the packer from
  // the (device precision, host precision) pair.
  if (precision == QUDA_DOUBLE_PRECISION) {
    packParityClover((double2 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityClover((float4 *)packedClover, (double *)h_clover, volumeCB, pad, cpu_order);
    } else {
      packParityClover((float4 *)packedClover, (float *)h_clover, volumeCB, pad, cpu_order);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (double *)h_clover, volumeCB, pad, cpu_order);
    } else {
      packParityCloverHalf((short4 *)packedClover, (float *)packedCloverNorm,
                           (float *)h_clover, volumeCB, pad, cpu_order);
    }
  }

  // Upload; the return codes were previously ignored, so a failed copy would
  // leave the device field stale without any diagnostic.
  cudaError_t err = cudaMemcpy(clover, packedClover, bytes/2, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    errorQuda("Error copying clover field to device (%s)", cudaGetErrorString(err));
  }
  if (halfPrecision) {
    err = cudaMemcpy(cloverNorm, packedCloverNorm, norm_bytes/2, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
      errorQuda("Error copying clover norm field to device (%s)", cudaGetErrorString(err));
    }
  }
}