void cpuColorSpinorField::copy(const cpuColorSpinorField &src) { checkField(*this, src); if (fieldOrder == src.fieldOrder && bytes == src.Bytes()) { if (fieldOrder == QUDA_QOP_DOMAIN_WALL_FIELD_ORDER) for (int i=0; i<x[nDim-1]; i++) memcpy(((void**)v)[i], ((void**)src.v)[i], bytes/x[nDim-1]); else memcpy(v, src.v, bytes); } else { copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION); } }
// Exchange the ghost (halo) faces of a CPU spinor field with the
// neighbouring processes in all four dimensions.  oddBit selects the
// parity being communicated; dagger is forwarded to packGhost and selects
// its packing convention.
void FaceBuffer::exchangeCpuSpinor(cpuColorSpinorField &spinor, int oddBit, int dagger) {

  // Message length for each dimension: nFace slices of the checkerboarded
  // face volume times Ninternal times precision (presumably a byte count,
  // since it is passed as the transfer length — TODO confirm against the
  // comm_* API).
  int msgLen[4];
  for (int dim = 0; dim < 4; dim++)
    msgLen[dim] = nFace*faceVolumeCB[dim]*Ninternal*precision;

  // allocate the ghost buffer if not yet allocated
  spinor.allocateGhostBuffer();

  // Fill the send buffers for both directions in every dimension.
  for (int dim = 0; dim < 4; dim++) {
    spinor.packGhost(spinor.backGhostFaceSendBuffer[dim], dim, QUDA_BACKWARDS, (QudaParity)oddBit, dagger);
    spinor.packGhost(spinor.fwdGhostFaceSendBuffer[dim], dim, QUDA_FORWARDS, (QudaParity)oddBit, dagger);
  }

  const int back_nbr[4]  = {X_BACK_NBR, Y_BACK_NBR, Z_BACK_NBR, T_BACK_NBR};
  const int fwd_nbr[4]   = {X_FWD_NBR, Y_FWD_NBR, Z_FWD_NBR, T_FWD_NBR};
  const int uptags[4]    = {XUP, YUP, ZUP, TUP};
  const int downtags[4]  = {XDOWN, YDOWN, ZDOWN, TDOWN};

  unsigned long recvFromBack[4], recvFromFwd[4];
  unsigned long sendToFwd[4], sendToBack[4];

  // Post the receives first, then the matching sends, dimension by
  // dimension.  Tag pairing: data sent forwards carries the "up" tag and
  // is received from the backward neighbour under the same tag (and
  // conversely for the "down" tag).
  for (int dim = 0; dim < 4; dim++) {
    recvFromBack[dim] = comm_recv_with_tag(spinor.backGhostFaceBuffer[dim], msgLen[dim], back_nbr[dim], uptags[dim]);
    recvFromFwd[dim]  = comm_recv_with_tag(spinor.fwdGhostFaceBuffer[dim], msgLen[dim], fwd_nbr[dim], downtags[dim]);
    sendToFwd[dim]    = comm_send_with_tag(spinor.fwdGhostFaceSendBuffer[dim], msgLen[dim], fwd_nbr[dim], uptags[dim]);
    sendToBack[dim]   = comm_send_with_tag(spinor.backGhostFaceSendBuffer[dim], msgLen[dim], back_nbr[dim], downtags[dim]);
  }

  // Block until every transfer has completed.
  for (int dim = 0; dim < 4; dim++) {
    comm_wait(recvFromBack[dim]);
    comm_wait(recvFromFwd[dim]);
    comm_wait(sendToFwd[dim]);
    comm_wait(sendToBack[dim]);
  }
}
// Copy this device-resident field out to the host field dest, converting
// precision, field order, and gamma basis as required.
//
// Path: device data is cudaMemcpy'd into the host staging "buffer", then
// unpackSpinor<Nc,Ns,N> rewrites buffer into dest's layout.  Half precision
// has no direct unpack path, so it is first promoted to a single-precision
// device temporary and the save is re-run from that.
void cudaColorSpinorField::saveCPUSpinorField(cpuColorSpinorField &dest) const {

  // The two fields must describe the same problem.
  if (nDim != dest.Ndim()) {
    errorQuda("Number of dimensions %d %d don't match", nDim, dest.Ndim());
  }

  if (volume != dest.volume) {
    errorQuda("Volumes %d %d don't match", volume, dest.volume);
  }

  if (SiteOrder() != dest.SiteOrder()) {
    errorQuda("Subset orders don't match");
  }

  if (nColor != 3) {
    errorQuda("Nc != 3 not yet supported");
  }

  if (siteSubset != dest.siteSubset) {
    errorQuda("Subset types do not match %d %d", siteSubset, dest.siteSubset);
  }

  if (precision == QUDA_HALF_PRECISION) {
    // No half-precision branch in the dispatch below: make a
    // single-precision device copy of this field and save that instead.
    ColorSpinorParam param(*this); // acquire all attributes of this
    param.precision = QUDA_SINGLE_PRECISION; // change precision
    param.create = QUDA_COPY_FIELD_CREATE;
    cudaColorSpinorField tmp(*this, param);
    tmp.saveCPUSpinorField(dest);
    return;
  }

  // (temporary?) bug fix for padding
  memset(buffer, 0, bufferBytes);

  // Stage the raw device data (bytes includes padding) on the host.
  cudaMemcpy(buffer, v, bytes, cudaMemcpyDeviceToHost);

  // Dispatch over (this precision) x (dest precision) x (device field
  // order).  unpackSpinor's third template argument is the vector width
  // (1, 2 or 4 floats) of the device ordering; arguments are destination
  // first, then the staged source.  No double4 ordering exists, hence
  // those branches error out.
#define SAVE_SPINOR_GPU_TO_CPU(myNs) \
  if (precision == QUDA_DOUBLE_PRECISION) { \
    if (dest.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((double*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((double*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((float*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((float*)dest.v, (double*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } \
  } else { \
    if (dest.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        unpackSpinor<3,myNs,4>((double*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        unpackSpinor<3,myNs,1>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        unpackSpinor<3,myNs,2>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        unpackSpinor<3,myNs,4>((float*)dest.v, (float*)buffer, volume, pad, x, dest.total_length, total_length, \
                               dest.SiteSubset(), dest.SiteOrder(), dest.GammaBasis(), gammaBasis, dest.FieldOrder()); \
      } \
    } \
  }

  // Only nSpin of 1 and 4 are supported here.
  switch(nSpin){
  case 1:
    SAVE_SPINOR_GPU_TO_CPU(1);
    break;
  case 4:
    SAVE_SPINOR_GPU_TO_CPU(4);
    break;
  default:
    errorQuda("invalid number of spinors in function %s\n", __FUNCTION__);
  }

#undef SAVE_SPINOR_GPU_TO_CPU

  return;
}
// Copy the host field src into this device-resident field, converting
// precision, field order, and gamma basis as required.
//
// Path: packSpinor<Nc,Ns,N> rewrites src into the host staging "buffer"
// in this field's device layout, which is then cudaMemcpy'd to the device.
// Half precision has no direct pack path, so src is first loaded into a
// single-precision device temporary and copied from there.
void cudaColorSpinorField::loadCPUSpinorField(const cpuColorSpinorField &src) {

  // The two fields must describe the same problem.
  if (nDim != src.Ndim()) {
    errorQuda("Number of dimensions %d %d don't match", nDim, src.Ndim());
  }

  if (volume != src.volume) {
    errorQuda("Volumes %d %d don't match", volume, src.volume);
  }

  if (SiteOrder() != src.SiteOrder()) {
    errorQuda("Subset orders don't match");
  }

  if (nColor != 3) {
    errorQuda("Nc != 3 not yet supported");
  }

  if (siteSubset != src.siteSubset) {
    errorQuda("Subset types do not match %d %d", siteSubset, src.siteSubset);
  }

  if (precision == QUDA_HALF_PRECISION) {
    // No half-precision branch in the dispatch below: load src into a
    // single-precision device temporary, then copy to this field.
    ColorSpinorParam param(*this); // acquire all attributes of this
    param.precision = QUDA_SINGLE_PRECISION; // change precision
    param.create = QUDA_COPY_FIELD_CREATE;
    cudaColorSpinorField tmp(src, param);
    copy(tmp);
    return;
  }

  // (temporary?) bug fix for padding
  memset(buffer, 0, bufferBytes);

  // Dispatch over (this precision) x (src precision) x (device field
  // order).  packSpinor's third template argument is the vector width
  // (1, 2 or 4 floats) of the device ordering; arguments are the staging
  // destination first, then src.  No double4 ordering exists, hence those
  // branches error out.
#define LOAD_SPINOR_CPU_TO_GPU(myNs) \
  if (precision == QUDA_DOUBLE_PRECISION) { \
    if (src.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((double*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((double*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((double*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((double*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        errorQuda("double4 not supported"); \
      } \
    } \
  } else { \
    if (src.precision == QUDA_DOUBLE_PRECISION) { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        packSpinor<3,myNs,4>((float*)buffer, (double*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } \
    } else { \
      if (fieldOrder == QUDA_FLOAT_FIELD_ORDER) { \
        packSpinor<3,myNs,1>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT2_FIELD_ORDER) { \
        packSpinor<3,myNs,2>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } else if (fieldOrder == QUDA_FLOAT4_FIELD_ORDER) { \
        packSpinor<3,myNs,4>((float*)buffer, (float*)src.v, volume, pad, x, total_length, src.total_length, \
                             src.SiteSubset(), src.SiteOrder(), gammaBasis, src.GammaBasis(), src.FieldOrder()); \
      } \
    } \
  }

  // Only nSpin of 1 and 4 are supported here.
  switch(nSpin){
  case 1:
    LOAD_SPINOR_CPU_TO_GPU(1);
    break;
  case 4:
    LOAD_SPINOR_CPU_TO_GPU(4);
    break;
  default:
    errorQuda("invalid number of spinors in function %s\n", __FUNCTION__);
  }

#undef LOAD_SPINOR_CPU_TO_GPU

  /*  for (int i=0; i<length; i++) {
    std::cout << i << " " << ((float*)src.v)[i] << " " << ((float*)buffer)[i] << std::endl;
    }*/

  // Push the re-ordered staging buffer to the device.
  cudaMemcpy(v, buffer, bytes, cudaMemcpyHostToDevice);
  return;
}
// This is just an initial hack for CPU comms - should be creating the message handlers at instantiation void FaceBuffer::exchangeCpuSpinor(cpuColorSpinorField &spinor, int oddBit, int dagger) { // allocate the ghost buffer if not yet allocated spinor.allocateGhostBuffer(); for(int i=0;i < 4; i++){ spinor.packGhost(spinor.backGhostFaceSendBuffer[i], i, QUDA_BACKWARDS, (QudaParity)oddBit, dagger); spinor.packGhost(spinor.fwdGhostFaceSendBuffer[i], i, QUDA_FORWARDS, (QudaParity)oddBit, dagger); } #ifdef QMP_COMMS QMP_msgmem_t mm_send_fwd[4]; QMP_msgmem_t mm_from_back[4]; QMP_msgmem_t mm_from_fwd[4]; QMP_msgmem_t mm_send_back[4]; QMP_msghandle_t mh_send_fwd[4]; QMP_msghandle_t mh_from_back[4]; QMP_msghandle_t mh_from_fwd[4]; QMP_msghandle_t mh_send_back[4]; for (int i=0; i<4; i++) { mm_send_fwd[i] = QMP_declare_msgmem(spinor.fwdGhostFaceSendBuffer[i], nbytes[i]); if( mm_send_fwd[i] == NULL ) errorQuda("Unable to allocate send fwd message mem"); mm_send_back[i] = QMP_declare_msgmem(spinor.backGhostFaceSendBuffer[i], nbytes[i]); if( mm_send_back == NULL ) errorQuda("Unable to allocate send back message mem"); mm_from_fwd[i] = QMP_declare_msgmem(spinor.fwdGhostFaceBuffer[i], nbytes[i]); if( mm_from_fwd[i] == NULL ) errorQuda("Unable to allocate recv from fwd message mem"); mm_from_back[i] = QMP_declare_msgmem(spinor.backGhostFaceBuffer[i], nbytes[i]); if( mm_from_back[i] == NULL ) errorQuda("Unable to allocate recv from back message mem"); mh_send_fwd[i] = QMP_declare_send_relative(mm_send_fwd[i], i, +1, 0); if( mh_send_fwd[i] == NULL ) errorQuda("Unable to allocate forward send"); mh_send_back[i] = QMP_declare_send_relative(mm_send_back[i], i, -1, 0); if( mh_send_back[i] == NULL ) errorQuda("Unable to allocate backward send"); mh_from_fwd[i] = QMP_declare_receive_relative(mm_from_fwd[i], i, +1, 0); if( mh_from_fwd[i] == NULL ) errorQuda("Unable to allocate forward recv"); mh_from_back[i] = QMP_declare_receive_relative(mm_from_back[i], i, -1, 0); if( mh_from_back[i] == 
NULL ) errorQuda("Unable to allocate backward recv"); } for (int i=0; i<4; i++) { QMP_start(mh_from_back[i]); QMP_start(mh_from_fwd[i]); QMP_start(mh_send_fwd[i]); QMP_start(mh_send_back[i]); } for (int i=0; i<4; i++) { QMP_wait(mh_send_fwd[i]); QMP_wait(mh_send_back[i]); QMP_wait(mh_from_back[i]); QMP_wait(mh_from_fwd[i]); } for (int i=0; i<4; i++) { QMP_free_msghandle(mh_send_fwd[i]); QMP_free_msghandle(mh_send_back[i]); QMP_free_msghandle(mh_from_fwd[i]); QMP_free_msghandle(mh_from_back[i]); QMP_free_msgmem(mm_send_fwd[i]); QMP_free_msgmem(mm_send_back[i]); QMP_free_msgmem(mm_from_back[i]); QMP_free_msgmem(mm_from_fwd[i]); } #else for (int i=0; i<4; i++) { //printf("%d COPY length = %d\n", i, nbytes[i]/precision); memcpy(spinor.fwdGhostFaceBuffer[i], spinor.backGhostFaceSendBuffer[i], nbytes[i]); memcpy(spinor.backGhostFaceBuffer[i], spinor.fwdGhostFaceSendBuffer[i], nbytes[i]); } #endif }