/**
 * Unpack a device-resident link field into CPU ordering and copy it to the host.
 *
 * Each parity is unpacked by link_format_gpu_to_cpu() into a temporary device
 * buffer and then copied to cpuGauge: the even half goes to the start of
 * cpuGauge, the odd half to cpuGauge + 4*Vh*gaugeSiteSize elements.
 *
 * @param cpuGauge host destination; must hold 2 * 4*Vh*gaugeSiteSize elements of Float
 * @param even     device pointer to the packed even-parity data
 * @param odd      device pointer to the packed odd-parity data
 * @param bytes    not used by this function
 * @param Vh       size parameter; each parity occupies 4*Vh*gaugeSiteSize elements
 *                 (presumably the half-lattice volume -- confirm against callers)
 * @param stride   forwarded unchanged to link_format_gpu_to_cpu()
 * @param prec     forwarded unchanged to link_format_gpu_to_cpu()
 *
 * NOTE(review): `streams` is not declared here; it must be provided by an
 * enclosing scope. The non-GPU_DIRECT path relies on the blocking cudaMemcpy
 * being ordered after the unpack launched on streams[0] -- confirm that
 * streams[0] is not a non-blocking stream.
 */
static void do_storeLinkToCPU(Float* cpuGauge, FloatN *even, FloatN *odd, int bytes, int Vh, int stride, QudaPrecision prec)
{
  // Compute sizes in size_t: the original int arithmetic
  // (4*Vh*gaugeSiteSize*sizeof(Float) stored in an int) can overflow for large Vh.
  const size_t parityElems = (size_t)4 * Vh * gaugeSiteSize;
  const size_t parityBytes = parityElems * sizeof(Float);

  // A single staging buffer is reused for both parities. This is safe because
  // the odd unpack is issued after the even copy, either on the same stream
  // (GPU_DIRECT) or after the blocking cudaMemcpy has returned.
  double *unpacked = (double *) device_malloc(parityBytes);

  // unpack even data kernel
  link_format_gpu_to_cpu((void*)unpacked, (void*)even, Vh, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  if (cudaMemcpyAsync(cpuGauge, unpacked, parityBytes, cudaMemcpyDeviceToHost, streams[0]) != cudaSuccess) {
    errorQuda("cudaMemcpyAsync() failed for even links\n");
  }
#else
  if (cudaMemcpy(cpuGauge, unpacked, parityBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    errorQuda("cudaMemcpy() failed for even links\n");
  }
#endif

  // unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpacked, (void*)odd, Vh, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  if (cudaMemcpyAsync(cpuGauge + parityElems, unpacked, parityBytes, cudaMemcpyDeviceToHost, streams[0]) != cudaSuccess) {
    errorQuda("cudaMemcpyAsync() failed for odd links\n");
  }
  // Wait for the asynchronous copies before releasing the staging buffer and
  // before the caller can read cpuGauge; do not rely on device_free() to block.
  if (cudaStreamSynchronize(streams[0]) != cudaSuccess) {
    errorQuda("cudaStreamSynchronize() failed\n");
  }
#else
  if (cudaMemcpy(cpuGauge + parityElems, unpacked, parityBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    errorQuda("cudaMemcpy() failed for odd links\n");
  }
#endif

  device_free(unpacked);
}
/**
 * Unpack a device-resident gauge field into CPU ordering and copy it to the host.
 *
 * The even half starts at `gauge`, the odd half at `gauge` + bytes/2 bytes.
 * Each half is unpacked by link_format_gpu_to_cpu() into its own half of a
 * temporary device buffer, on its own stream, and then copied to cpuGauge:
 * even data at the start, odd data at cpuGauge + 4*volumeCB*gaugeSiteSize
 * elements.
 *
 * @param cpuGauge host destination; must hold 2 * 4*volumeCB*gaugeSiteSize elements of Float
 * @param gauge    device pointer to the packed field (even half followed by odd half)
 * @param bytes    total size in bytes of the packed field; bytes/2 is the odd-half offset
 * @param volumeCB size parameter; each parity occupies 4*volumeCB*gaugeSiteSize elements
 *                 (presumably the checkerboard volume -- confirm against callers)
 * @param stride   forwarded unchanged to link_format_gpu_to_cpu()
 * @param prec     forwarded unchanged to link_format_gpu_to_cpu()
 *
 * CUDA failures are reported through errorQuda().
 */
static void storeGaugeField(Float* cpuGauge, FloatN *gauge, int bytes, int volumeCB, int stride, QudaPrecision prec)
{
  // Two private streams, one per parity.
  cudaStream_t streams[2];
  for (int i=0; i<2; i++) {
    if (cudaStreamCreate(&streams[i]) != cudaSuccess) {
      errorQuda("cudaStreamCreate() failed\n");
    }
  }

  FloatN *even = gauge;
  FloatN *odd = (FloatN*)((char*)gauge + bytes/2);

  // Compute sizes in size_t: the original int arithmetic
  // (4*2*volumeCB*gaugeSiteSize*sizeof(Float) stored in an int) can overflow
  // for large volumeCB.
  const size_t parityElems = (size_t)4 * volumeCB * gaugeSiteSize;
  const size_t parityBytes = parityElems * sizeof(Float);

  // One allocation holds both parities: even in the first half, odd in the second.
  void *unpacked = 0;
  if (cudaMalloc(&unpacked, 2*parityBytes) != cudaSuccess) {
    errorQuda("cudaMalloc() failed for unpacked\n");
  }
  void *unpackedEven = unpacked;
  void *unpackedOdd = (char*)unpacked + parityBytes;

  // unpack even data kernel
  link_format_gpu_to_cpu((void*)unpackedEven, (void*)even, volumeCB, stride, prec, streams[0]);
#ifdef GPU_DIRECT
  if (cudaMemcpyAsync(cpuGauge, unpackedEven, parityBytes, cudaMemcpyDeviceToHost, streams[0]) != cudaSuccess) {
    errorQuda("cudaMemcpyAsync() failed for even links\n");
  }
#else
  if (cudaMemcpy(cpuGauge, unpackedEven, parityBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    errorQuda("cudaMemcpy() failed for even links\n");
  }
#endif

  // unpack odd data kernel
  link_format_gpu_to_cpu((void*)unpackedOdd, (void*)odd, volumeCB, stride, prec, streams[1]);
#ifdef GPU_DIRECT
  if (cudaMemcpyAsync(cpuGauge + parityElems, unpackedOdd, parityBytes, cudaMemcpyDeviceToHost, streams[1]) != cudaSuccess) {
    errorQuda("cudaMemcpyAsync() failed for odd links\n");
  }
  // Both asynchronous copies must finish before the staging buffer is freed
  // and before the caller reads cpuGauge.
  for (int i=0; i<2; i++) {
    if (cudaStreamSynchronize(streams[i]) != cudaSuccess) {
      errorQuda("cudaStreamSynchronize() failed\n");
    }
  }
#else
  if (cudaMemcpy(cpuGauge + parityElems, unpackedOdd, parityBytes, cudaMemcpyDeviceToHost) != cudaSuccess) {
    errorQuda("cudaMemcpy() failed for odd links\n");
  }
#endif

  cudaFree(unpacked);
  for (int i=0; i<2; i++) cudaStreamDestroy(streams[i]);
}