示例#1
0
void cudaCloverField::loadFullField(void *even, void *evenNorm, void *odd, void *oddNorm, 
				    const void *h_clover, const QudaPrecision cpu_prec, 
				    const CloverFieldOrder cpu_order)
{
  // use pinned memory                  
  void *packedEven, *packedEvenNorm, *packedOdd, *packedOddNorm;

  if (precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (cpu_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover order");
  }

  cudaMallocHost(&packedEven, bytes/2);
  cudaMallocHost(&packedOdd, bytes/2);
  if (precision == QUDA_HALF_PRECISION) {
    cudaMallocHost(&packedEvenNorm, norm_bytes/2);
    cudaMallocHost(&packedOddNorm, norm_bytes/2);
  }
    
  if (precision == QUDA_DOUBLE_PRECISION) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, x, pad);
  } else if (precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, x, pad);
    } else {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, x, pad);    
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *) packedOddNorm, (double *)clover, x, pad);
    } else {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float * )packedOddNorm, (float *)clover, x, pad);    
    }
  }

  cudaMemcpy(even, packedEven, bytes/2, cudaMemcpyHostToDevice);
  cudaMemcpy(odd, packedOdd, bytes/2, cudaMemcpyHostToDevice);
  if (precision == QUDA_HALF_PRECISION) {
    cudaMemcpy(evenNorm, packedEvenNorm, norm_bytes/2, cudaMemcpyHostToDevice);
    cudaMemcpy(oddNorm, packedOddNorm, norm_bytes/2, cudaMemcpyHostToDevice);
  }

  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (precision == QUDA_HALF_PRECISION) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
}
示例#2
0
/**
 * Pack a host clover field into the device ordering and upload both parities
 * of a FullClover.
 *
 * The host data is reordered into temporary host staging buffers (pinned,
 * unless built with __DEVICE_EMULATION__, in which case plain malloc is used)
 * and then copied with blocking host-to-device copies into ret.even / ret.odd.
 * Sizes for both parities are taken from ret.even.
 *
 * @param ret          Destination field; ret.even.clover and ret.odd.clover are
 *                     written, and additionally ret.even.cloverNorm and
 *                     ret.odd.cloverNorm when ret.even.precision is
 *                     QUDA_HALF_PRECISION
 * @param clover       Host clover data to pack; interpreted as double or float
 *                     according to cpu_prec
 * @param cpu_prec     Precision of the host data
 * @param clover_order Ordering of the host data; only
 *                     QUDA_LEX_PACKED_CLOVER_ORDER is accepted
 *
 * Errors (unsupported precision/order combination, failed allocation or failed
 * copy) are reported through errorQuda.
 */
void loadFullClover(FullClover ret, void *clover, QudaPrecision cpu_prec,
		    CloverFieldOrder clover_order)
{
  if (ret.even.precision == QUDA_DOUBLE_PRECISION && cpu_prec != QUDA_DOUBLE_PRECISION) {
    errorQuda("Cannot have CUDA double precision without CPU double precision");
  }
  if (clover_order != QUDA_LEX_PACKED_CLOVER_ORDER) {
    errorQuda("Invalid clover order");
  }

  // The norm buffers exist only in half precision
  const bool half = (ret.even.precision == QUDA_HALF_PRECISION);
  // Size of one parity's norm buffer, as used throughout this routine
  // NOTE(review): the divisor 18 is taken verbatim from the original code;
  // its derivation is not visible here - confirm against the field layout.
  const size_t normBytes = ret.even.bytes/18;

  // Host staging buffers; start from null so that an unused norm pointer
  // never holds an indeterminate value
  void *packedEven = 0, *packedEvenNorm = 0, *packedOdd = 0, *packedOddNorm = 0;

#ifndef __DEVICE_EMULATION__
  // use pinned memory
  if (cudaMallocHost(&packedEven, ret.even.bytes) != cudaSuccess ||
      cudaMallocHost(&packedOdd, ret.even.bytes) != cudaSuccess) {
    errorQuda("Failed to allocate pinned host memory for packed clover field");
  }
  if (half) {
    if (cudaMallocHost(&packedEvenNorm, normBytes) != cudaSuccess ||
	cudaMallocHost(&packedOddNorm, normBytes) != cudaSuccess) {
      errorQuda("Failed to allocate pinned host memory for packed clover norm");
    }
  }
#else
  packedEven = malloc(ret.even.bytes);
  packedOdd = malloc(ret.even.bytes);
  if (!packedEven || !packedOdd) {
    errorQuda("Failed to allocate host memory for packed clover field");
  }
  if (half) {
    packedEvenNorm = malloc(normBytes);
    packedOddNorm = malloc(normBytes);
    if (!packedEvenNorm || !packedOddNorm) {
      errorQuda("Failed to allocate host memory for packed clover norm");
    }
  }
#endif

  // Reorder the host data into the staging buffers
  if (ret.even.precision == QUDA_DOUBLE_PRECISION) {
    packFullClover((double2 *)packedEven, (double2 *)packedOdd, (double *)clover, ret.even.X, ret.even.pad);
  } else if (ret.even.precision == QUDA_SINGLE_PRECISION) {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (double *)clover, ret.even.X, ret.even.pad);
    } else {
      packFullClover((float4 *)packedEven, (float4 *)packedOdd, (float *)clover, ret.even.X, ret.even.pad);
    }
  } else {
    if (cpu_prec == QUDA_DOUBLE_PRECISION) {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (double *)clover, ret.even.X, ret.even.pad);
    } else {
      packFullCloverHalf((short4 *)packedEven, (float *)packedEvenNorm, (short4 *)packedOdd,
			 (float *)packedOddNorm, (float *)clover, ret.even.X, ret.even.pad);
    }
  }

  // Blocking uploads of the packed data
  if (cudaMemcpy(ret.even.clover, packedEven, ret.even.bytes, cudaMemcpyHostToDevice) != cudaSuccess ||
      cudaMemcpy(ret.odd.clover, packedOdd, ret.even.bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
    errorQuda("Failed to copy packed clover field to the device");
  }
  if (half) {
    if (cudaMemcpy(ret.even.cloverNorm, packedEvenNorm, normBytes, cudaMemcpyHostToDevice) != cudaSuccess ||
	cudaMemcpy(ret.odd.cloverNorm, packedOddNorm, normBytes, cudaMemcpyHostToDevice) != cudaSuccess) {
      errorQuda("Failed to copy packed clover norm to the device");
    }
  }

  // Release the staging buffers with the deallocator matching their allocator
#ifndef __DEVICE_EMULATION__
  cudaFreeHost(packedEven);
  cudaFreeHost(packedOdd);
  if (half) {
    cudaFreeHost(packedEvenNorm);
    cudaFreeHost(packedOddNorm);
  }
#else
  free(packedEven);
  free(packedOdd);
  if (half) {
    free(packedEvenNorm);
    free(packedOddNorm);
  }
#endif

}