Exemplo n.º 1
0
  /**
   * Unpack the even and odd halves of a device link field and copy them into
   * a host buffer.
   *
   * Each half is first converted by link_format_gpu_to_cpu() into a temporary
   * device buffer (reused for both halves) and then copied to the host. The
   * even half lands at cpuGauge[0], the odd half at
   * cpuGauge[4*Vh*gaugeSiteSize].
   *
   * @param cpuGauge host destination; must hold 2*4*Vh*gaugeSiteSize Float
   *                 elements (with GPU_DIRECT it is the target of
   *                 cudaMemcpyAsync -- presumably pinned, confirm with caller)
   * @param even     device pointer to the even half, passed to
   *                 link_format_gpu_to_cpu()
   * @param odd      device pointer to the odd half, passed to
   *                 link_format_gpu_to_cpu()
   * @param bytes    unused in this function; kept for interface compatibility
   * @param Vh       forwarded to link_format_gpu_to_cpu() and used to size
   *                 the per-half buffer (4*Vh*gaugeSiteSize elements)
   * @param stride   forwarded to link_format_gpu_to_cpu()
   * @param prec     forwarded to link_format_gpu_to_cpu()
   *
   * On return all copies into cpuGauge have completed.
   * NOTE(review): CUDA return codes are not checked here; no error-reporting
   * facility is visible from this block.
   */
  static void do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd, 
				int bytes, int Vh, int stride, QudaPrecision prec) 
  {  
    // Sizes are computed in size_t so that large Vh cannot overflow int.
    const size_t halfLen   = (size_t)4*Vh*gaugeSiteSize;  // Float elements per half
    const size_t halfBytes = halfLen*sizeof(Float);

    // Single scratch buffer, reused for the even and then the odd half.
    void *unpacked = (void*) device_malloc(halfBytes);

    //unpack even data kernel
    link_format_gpu_to_cpu(unpacked, (void*)even, Vh, stride, prec, streams[0]);

#ifdef GPU_DIRECT 
    cudaMemcpyAsync(cpuGauge, unpacked, halfBytes, cudaMemcpyDeviceToHost, streams[0]);
#else
    // streams[] is defined outside this function: wait explicitly so the
    // blocking copy never depends on implicit default-stream ordering.
    cudaStreamSynchronize(streams[0]);
    cudaMemcpy(cpuGauge, unpacked, halfBytes, cudaMemcpyDeviceToHost);
#endif
  
    //unpack odd data kernel
    // Same stream as the even copy, so the scratch buffer is not overwritten
    // before the even half has been copied out.
    link_format_gpu_to_cpu(unpacked, (void*)odd, Vh, stride, prec, streams[0]);
#ifdef GPU_DIRECT 
    cudaMemcpyAsync(cpuGauge + halfLen, unpacked, halfBytes, cudaMemcpyDeviceToHost, streams[0]);
    // Make sure both async copies have finished before the scratch buffer is
    // released and before the caller reads cpuGauge.
    cudaStreamSynchronize(streams[0]);
#else
    cudaStreamSynchronize(streams[0]);
    cudaMemcpy(cpuGauge + halfLen, unpacked, halfBytes, cudaMemcpyDeviceToHost);
#endif
  
    device_free(unpacked);
  }
Exemplo n.º 2
0
/**
 * Copy a device gauge field to host memory.
 *
 * The even half starts at `gauge`, the odd half at `gauge + bytes/2` (byte
 * offset). Each half is converted by link_format_gpu_to_cpu() into its own
 * part of a temporary device buffer and then copied to the host: even half
 * at cpuGauge[0], odd half at cpuGauge[4*volumeCB*gaugeSiteSize].
 *
 * @param cpuGauge host destination; must hold 2*4*volumeCB*gaugeSiteSize
 *                 Float elements (with GPU_DIRECT it is the target of
 *                 cudaMemcpyAsync -- presumably pinned, confirm with caller)
 * @param gauge    device pointer to the packed field (both halves)
 * @param bytes    total size in bytes of the packed device field; only
 *                 bytes/2 is used, to locate the odd half
 * @param volumeCB forwarded to link_format_gpu_to_cpu() and used to size
 *                 the per-half buffers
 * @param stride   forwarded to link_format_gpu_to_cpu()
 * @param prec     forwarded to link_format_gpu_to_cpu()
 *
 * Any failing CUDA runtime call on the data path is reported through
 * errorQuda(). On return all copies into cpuGauge have completed.
 */
static void storeGaugeField(Float* cpuGauge, FloatN *gauge, int bytes, int volumeCB, 
			    int stride, QudaPrecision prec) 
{  
  cudaStream_t streams[2];
  for (int i=0; i<2; i++) {
    if (cudaStreamCreate(&streams[i]) != cudaSuccess) {
      errorQuda("cudaStreamCreate() failed\n");
    }
  }
  
  FloatN *even = gauge;
  FloatN *odd = (FloatN*)((char*)gauge + bytes/2);

  // Sizes are computed in size_t so that large volumes cannot overflow int.
  const size_t halfLen   = (size_t)4*volumeCB*gaugeSiteSize;  // Float elements per half
  const size_t halfBytes = halfLen*sizeof(Float);

  void *unpacked;  
  if(cudaMalloc(&unpacked, 2*halfBytes) != cudaSuccess){ // both parities
    errorQuda("cudaMalloc() failed for unpacked\n");
  }
  void *unpackedEven = unpacked;
  void *unpackedOdd = (char*)unpacked + halfBytes;
  
  //unpack even data kernel
  link_format_gpu_to_cpu((void*)unpackedEven, (void*)even, volumeCB, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  if (cudaMemcpyAsync(cpuGauge, unpackedEven, halfBytes, cudaMemcpyDeviceToHost, streams[0]) != cudaSuccess) {
    errorQuda("cudaMemcpyAsync() failed for even gauge data\n");
  }
#else
  // Wait for the unpack kernel explicitly instead of relying on implicit
  // ordering between the default stream and streams[0].
  if (cudaStreamSynchronize(streams[0]) != cudaSuccess) {
    errorQuda("cudaStreamSynchronize() failed for even gauge data\n");
  }
  if (cudaMemcpy(cpuGauge, unpackedEven, halfBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    errorQuda("cudaMemcpy() failed for even gauge data\n");
  }
#endif
  
  //unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpackedOdd, (void*)odd, volumeCB, stride, prec, streams[1]);
#ifdef GPU_DIRECT
  if (cudaMemcpyAsync(cpuGauge + halfLen, unpackedOdd, halfBytes, cudaMemcpyDeviceToHost, streams[1]) != cudaSuccess) {
    errorQuda("cudaMemcpyAsync() failed for odd gauge data\n");
  }
  // Both halves must be on the host before the device buffer is freed and
  // before the caller reads cpuGauge.
  for(int i=0; i<2; i++) {
    if (cudaStreamSynchronize(streams[i]) != cudaSuccess) {
      errorQuda("cudaStreamSynchronize() failed\n");
    }
  }
#else
  if (cudaStreamSynchronize(streams[1]) != cudaSuccess) {
    errorQuda("cudaStreamSynchronize() failed for odd gauge data\n");
  }
  if (cudaMemcpy(cpuGauge + halfLen, unpackedOdd, halfBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    errorQuda("cudaMemcpy() failed for odd gauge data\n");
  }
#endif
  
  // Cleanup: return codes intentionally not treated as fatal here.
  cudaFree(unpacked);
  for(int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
}