void cudaCloverField::loadFullField(void *even, void *evenNorm, void *odd, void *oddNorm, const void *h_clover, const QudaPrecision cpu_prec, const CloverFieldOrder cpu_order) { // use pinned memory void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm; if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) { errorQuda("Cannot have CUDA double precision without CPU double precision"); } if (cpu_order != QUDA_LEX_PACKED_CLOVER_ORDER) { errorQuda("Invalid clover order"); } cudaMallocHost(&packedEven, bytes/2); cudaMallocHost(&packedOdd, bytes/2); if (precision == QUDA_HALF_PRECISION) { cudaMallocHost(&packedEvenNorm, norm_bytes/2); cudaMallocHost(&packedOddNorm, norm_bytes/2); } if (precision == QUDA_DOUBLE_PRECISION) { packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, x, pad); } else if (precision == QUDA_SINGLE_PRECISION) { if (cpu_prec == QUDA_DOUBLE_PRECISION) { packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, x, pad); } else { packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, x, pad); } } else { if (cpu_prec == QUDA_DOUBLE_PRECISION) { packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd, (float *) packedOddNorm, (double *)clover, x, pad); } else { packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd, (float * )packedOddNorm, (float *)clover, x, pad); } } cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice); cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice); if (precision == QUDA_HALF_PRECISION) { cudaMemcpy(evenNorm, packedEvenNorm, norm_bytes/2, cudaMemcpyHostToDevice); cudaMemcpy(oddNorm, packedOddNorm, norm_bytes/2, cudaMemcpyHostToDevice); } cudaFreeHost(packedEven); cudaFreeHost(packedOdd); if (precision == QUDA_HALF_PRECISION) { cudaFreeHost(packedEvenNorm); cudaFreeHost(packedOddNorm); } }
void loadFullClover(FullClover ret, void *clover, QudaPrecision cpu_prec, CloverFieldOrder clover_order) { // use pinned memory void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm; if (ret.even.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) { errorQuda("Cannot have CUDA double precision without CPU double precision"); } if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) { errorQuda("Invalid clover order"); } #ifndef __DEVICE_EMULATION__ cudaMallocHost(&packedEven, ret.even.bytes); cudaMallocHost(&packedOdd, ret.even.bytes); if (ret.even.precision == QUDA_HALF_PRECISION) { cudaMallocHost(&packedEvenNorm, ret.even.bytes/18); cudaMallocHost(&packedOddNorm, ret.even.bytes/18); } #else packedEven = malloc(ret.even.bytes); packedOdd = malloc(ret.even.bytes); if (ret.even.precision == QUDA_HALF_PRECISION) { packedEvenNorm = malloc(ret.even.bytes/18); packedOddNorm = malloc(ret.even.bytes/18); } #endif if (ret.even.precision == QUDA_DOUBLE_PRECISION) { packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, ret.even.X, ret.even.pad); } else if (ret.even.precision == QUDA_SINGLE_PRECISION) { if (cpu_prec == QUDA_DOUBLE_PRECISION) { packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, ret.even.X, ret.even.pad); } else { packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, ret.even.X, ret.even.pad); } } else { if (cpu_prec == QUDA_DOUBLE_PRECISION) { packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd, (float *) packedOddNorm, (double *)clover, ret.even.X, ret.even.pad); } else { packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd, (float * )packedOddNorm, (float *)clover, ret.even.X, ret.even.pad); } } cudaMemcpy(ret.even.clover, packedEven, ret.even.bytes, cudaMemcpyHostToDevice); cudaMemcpy(ret.odd.clover, packedOdd, ret.even.bytes, cudaMemcpyHostToDevice); if (ret.even.precision == QUDA_HALF_PRECISION) { cudaMemcpy(ret.even.cloverNorm, packedEvenNorm, ret.even.bytes/18, cudaMemcpyHostToDevice); cudaMemcpy(ret.odd.cloverNorm, packedOddNorm, ret.even.bytes/18, cudaMemcpyHostToDevice); } #ifndef __DEVICE_EMULATION__ cudaFreeHost(packedEven); cudaFreeHost(packedOdd); if (ret.even.precision == QUDA_HALF_PRECISION) { cudaFreeHost(packedEvenNorm); cudaFreeHost(packedOddNorm); } #else free(packedEven); free(packedOdd); if (ret.even.precision == QUDA_HALF_PRECISION) { free(packedEvenNorm); free(packedOddNorm); } #endif }