//convert a tensor from its plain CPU layout into the MKL primitive layout
//if the layouts differ, run a layout conversion; otherwise copy memory directly
void ConvertToMKL(unsigned long long tensor_)
{
    long long * tensor = (long long *)tensor_;
    if (tensor[CPUPtr] == 0)
    {
        printf("error converting to MKL tensor: CPU pointer is NULL!\n");
        return;
    }
    if (tensor[MKLLayout] == 0)
        return; //no conversion needed
    if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], (dnnLayout_t)tensor[CPULayout]))
    {
        dnnError_t err;
        dnnPrimitive_t cv;
        CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[CPULayout], (dnnLayout_t)tensor[MKLLayout]), err );
        CHECK_ERR( dnnConversionExecute_F32(cv, (float *)tensor[CPUPtr], (float *)tensor[MKLPtr]), err );
        dnnDelete_F32(cv); //release the conversion primitive to avoid a leak
    }
    else
    {
        //layouts match: a straight memory copy suffices
        memcpy((void*)tensor[MKLPtr], (void*)tensor[CPUPtr],
               dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout]));
    }
ERR_RETURN:
    return;
}
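/*
 * Usage sketch for ConvertToMKL (illustrative only, not part of the build).
 * It assumes the tensor descriptor convention used throughout this file: a
 * long long array indexed by the CPUPtr/MKLPtr/CPULayout/MKLLayout constants
 * defined elsewhere in this file. The variable names below are hypothetical;
 * the MKL-DNN calls are the standard ones for creating layouts and buffers.
 *
 *   long long desc[4] = {0};
 *   dnnLayout_t lt_user = NULL, lt_prim = NULL;
 *   float* mkl_buf = NULL;
 *   size_t sizes[DIM4]   = { W, H, C, N };
 *   size_t strides[DIM4] = { 1, W, W*H, W*H*C };
 *   dnnLayoutCreate_F32(&lt_user, DIM4, sizes, strides);              // plain layout
 *   dnnLayoutCreateFromPrimitive_F32(&lt_prim, prim, dnnResourceSrc); // primitive layout
 *   dnnAllocateBuffer_F32((void**)&mkl_buf, lt_prim);                 // MKL-side buffer
 *   desc[CPUPtr]    = (long long)cpu_data;   // plain float data
 *   desc[CPULayout] = (long long)lt_user;
 *   desc[MKLPtr]    = (long long)mkl_buf;
 *   desc[MKLLayout] = (long long)lt_prim;
 *   ConvertToMKL((unsigned long long)desc);  // desc[MKLPtr] now holds converted data
 */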
//convert a tensor in MKL layout back to the plain CPU layout
//if the layouts differ, convert to plain NCHW and then transpose to CHWN;
//otherwise copy memory directly
void ConvertBack(unsigned long long tensor_, int N, int C, int H, int W)
{
    long long * tensor = (long long *)tensor_;
    if (tensor[CPUPtr] == 0)
    {
        printf("error converting tensor back from MKL layout: CPU pointer is NULL!\n");
        return;
    }
    if (tensor[MKLLayout] == 0)
        return; //no conversion needed

    dnnError_t err;
    //plain NCHW layout: MKL-DNN lists sizes/strides innermost (W) first
    size_t inSize[DIM4]   = { W, H, C, N };
    size_t inStride[DIM4] = { 1, W, W*H, W*H*C };
    dnnLayout_t lt_NCHW = NULL;
    float* newPtr = NULL;
    CHECK_ERR( dnnLayoutCreate_F32(&lt_NCHW, DIM4, inSize, inStride), err );
    if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], (dnnLayout_t)tensor[CPULayout]))
    {
        float* cpuPtr = (float *)tensor[CPUPtr];
        float* mklPtr = (float *)tensor[MKLPtr];
        //first bring the MKL buffer into plain NCHW if it is not there already
        if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], lt_NCHW))
        {
            dnnPrimitive_t cv;
            CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[MKLLayout], lt_NCHW), err );
            newPtr = (float*)malloc(N*C*H*W*sizeof(float));
            CHECK_ERR( dnnConversionExecute_F32(cv, mklPtr, newPtr), err );
            dnnDelete_F32(cv); //release the conversion primitive
            mklPtr = newPtr;
        }
        //transpose the N x (C*H*W) matrix to (C*H*W) x N, i.e. NCHW -> CHWN
        mkl_somatcopy('r', 't', N, C*H*W, 1.0, mklPtr, C*H*W, cpuPtr, N);
    }
    else
    {
        long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout]);
        float * destPtr = (float*)tensor[CPUPtr];
        float * srcPtr  = (float*)tensor[MKLPtr];
        #pragma omp parallel for
        for (long long i = 0; i < grad_in_len/4; ++i) //byte size / sizeof(float)
        {
            destPtr[i] = srcPtr[i];
        }
    }
ERR_RETURN:
    if (lt_NCHW != NULL)
    {
        dnnLayoutDelete_F32(lt_NCHW);
    }
    if (newPtr != NULL)
    {
        free(newPtr);
    }
}
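/*
 * Minimal sketch of the mkl_somatcopy call used above (illustrative; src/dst
 * are hypothetical names). Viewing a plain NCHW buffer as an N x (C*H*W)
 * row-major matrix, an out-of-place transpose yields a (C*H*W) x N matrix,
 * which is exactly CHWN element order:
 *
 *   mkl_somatcopy('r',        // row-major ordering
 *                 't',        // transpose
 *                 N, C*H*W,   // rows/cols of the source matrix
 *                 1.0f,       // alpha scaling factor
 *                 src, C*H*W, // source and its leading dimension
 *                 dst, N);    // destination and its leading dimension
 */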
void MaxPooling_bprop(
    unsigned long long gradOutput,    //input, N*outC*outH*outW
    unsigned long long gradInput,     //output result
    unsigned long long dnnprimitives,
    int initOK,
    const float beta)
{
    dnnError_t err;
    long long* primitives = (long long*)dnnprimitives;
    if (initOK == 0)
    {
        Init_b((long long *)gradInput, (long long *)gradOutput, primitives);
    }

    //gather resources for the backward pooling primitive
    float* resPool[dnnResourceNumber] = {0};
    float* OutPtr = GetPtr(gradOutput);
    resPool[dnnResourceDiffSrc]   = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    resPool[dnnResourceDiffDst]   = OutPtr;
    resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE];

    //convert gradOut if necessary
    dnnPrimitive_t cv_out_b = (dnnPrimitive_t)(primitives[CV_POOLING_BACKWARD_OUTPUT]);
    if (cv_out_b)
    {
        float* buf_out_b = (float*)primitives[BUFFER_POOLING_BACKWARD_OUTPUT];
        CHECK_ERR( dnnConversionExecute_F32(cv_out_b, OutPtr, buf_out_b), err );
        resPool[dnnResourceDiffDst] = buf_out_b;
    }

    //zero the diffSrc buffer; backward max pooling scatters gradients into it
    long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)primitives[POOL_L_B_I]);
    float * tempPtr = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    #pragma omp parallel for
    for (long long i = 0; i < grad_in_len/4; ++i) //byte size / sizeof(float)
    {
        tempPtr[i] = 0;
    }

    CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_BACKWARD], (void**)resPool), err );

    if (beta != 0.0)
    {
        //beta != 0 requires accumulating the previous delta into the result
        long long* ptr_gradInput = (long long*)gradInput;
        float* pFirstBuf = GetPtr(gradInput);
        dnnLayout_t layout_pre_delta = (dnnLayout_t)ptr_gradInput[MKLLayout];
        if (layout_pre_delta == NULL)
            layout_pre_delta = (dnnLayout_t)primitives[POOL_L_I];
        dnnLayout_t layout_add_delta = (dnnLayout_t)primitives[POOL_L_B_I];
        float* temp_memory = NULL;
        if (!dnnLayoutCompare_F32(layout_add_delta, layout_pre_delta))
        {
            //bring the previous delta into the layout of the new delta before adding
            CHECK_ERR( dnnAllocateBuffer_F32((void**)&temp_memory, layout_add_delta), err );
            dnnPrimitive_t cv = NULL;
            CHECK_ERR( dnnConversionCreate_F32(&cv, layout_pre_delta, layout_add_delta), err );
            CHECK_ERR( dnnConversionExecute_F32(cv, pFirstBuf, temp_memory), err );
            dnnDelete_F32(cv); //release the conversion primitive
            pFirstBuf = temp_memory;
        }
        long long len = (long long)dnnLayoutGetMemorySize_F32(layout_add_delta) / 4;
        cblas_saxpy(len, 1.0, pFirstBuf, 1, (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1);
        if (temp_memory != NULL)
            dnnReleaseBuffer_F32(temp_memory);
    }

    //publish the result: gradInput now lives in the MKL diffSrc buffer
    ((long long *)gradInput)[MKLLayout] = primitives[POOL_L_B_I];
    ((long long *)gradInput)[MKLPtr]    = primitives[BUFFER_POOLING_BACKWARD_INPUT];
ERR_RETURN:
    return;
}
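/*
 * Note on the beta path above (illustrative). cblas_saxpy computes
 * y := a*x + y, so with a = 1.0 the previous delta (x = pFirstBuf) is
 * accumulated into the freshly computed diffSrc buffer (y). As written,
 * beta acts only as an on/off switch; a hypothetical scaled accumulation
 * would pass beta as the alpha argument instead:
 *
 *   //result = new_delta + beta * previous_delta
 *   cblas_saxpy(len, beta, pFirstBuf, 1,
 *               (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1);
 */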
static void Conv_bfilter_init(
    long long * input,
    long long * gradOutput,
    long long * gradWeight,
    long long * primitives,
    int N, int oC, int oH, int oW)
{
    dnnError_t err;

    //gradOut: set up conversion from its current layout to the backward-filter layout
    dnnLayout_t lt_out = (dnnLayout_t)(gradOutput[MKLLayout]);
    if (lt_out == NULL)
        lt_out = (dnnLayout_t)primitives[L_O];
    dnnPrimitive_t cv_out_bfilter = NULL;
    float* buf_out_bfilter = NULL;
    CHECK_ERR( try_convert(&cv_out_bfilter, &buf_out_bfilter, lt_out, (dnnLayout_t)primitives[L_BF_O]), err );
    primitives[CONVERT_BWDFILTER_OUTPUT] = (long long)cv_out_bfilter;
    primitives[BUFFER_BWDFILTER_OUTPUT]  = (long long)buf_out_bfilter;

    //if gradOut arrives in plain CPU layout (e.g. the topmost layer, which gets
    //no MKL delta), it must first be transposed; allocate the scratch buffer once
    float* gradOutTransPtr = NULL;
    if (gradOutput[MKLLayout] == 0 && primitives[BUFFER_TRANS_OUTPUT] == 0)
    {
        gradOutTransPtr = (float*)malloc(N*oC*oH*oW*sizeof(float));
        primitives[BUFFER_TRANS_OUTPUT] = (long long)gradOutTransPtr;
    }

    //filter: conversion from the backward-filter layout back to the user filter layout
    dnnLayout_t lt_filter = (dnnLayout_t)primitives[L_W];
    dnnLayout_t lt_filter_bfilter = (dnnLayout_t)primitives[L_BF_W];
    dnnPrimitive_t cv_filter_bfilter = NULL;
    float * buf_filter_bfilter = NULL;
    if (!dnnLayoutCompare_F32(lt_filter_bfilter, lt_filter))
    {
        CHECK_ERR( dnnConversionCreate_F32(&cv_filter_bfilter, lt_filter_bfilter, lt_filter), err );
        CHECK_ERR( dnnAllocateBuffer_F32((void**)&buf_filter_bfilter, lt_filter_bfilter), err );
    }
    primitives[BUFFER_BWDFILTER_FILTER]  = (long long)buf_filter_bfilter;
    primitives[CONVERT_BWDFILTER_FILTER] = (long long)cv_filter_bfilter;

    //input: set up conversion to the backward-filter input layout if needed
    dnnLayout_t lt_in_real = (dnnLayout_t)input[MKLLayout];
    if (lt_in_real == NULL)
    {
        lt_in_real = (dnnLayout_t)primitives[L_I];
    }
    dnnLayout_t lt_in_bfilter = (dnnLayout_t)primitives[L_BF_I];
    dnnPrimitive_t cv_in_bfilter = NULL;
    float* buf_in_bfilter = (float*)(input[CPUPtr]);
    CHECK_ERR( try_convert(&cv_in_bfilter, &buf_in_bfilter, lt_in_real, lt_in_bfilter), err );
    primitives[BUFFER_BWDFILTER_INPUT]  = (long long)buf_in_bfilter;
    primitives[CONVERT_BWDFILTER_INPUT] = (long long)cv_in_bfilter;

    //if the layer has a bias
    if (primitives[BDW_BIAS_INDEX] != 0)
    {
        //conversion for grad_bias if necessary
        dnnLayout_t lt_bias_bias = (dnnLayout_t)primitives[L_B_B];
        dnnLayout_t lt_bias = (dnnLayout_t)primitives[L_B];
        dnnPrimitive_t cv_bias_bias = NULL;
        float * buf_bias_bias = NULL;
        CHECK_ERR( dnnConversionCreate_F32(&cv_bias_bias, lt_bias_bias, lt_bias), err );
        CHECK_ERR( dnnAllocateBuffer_F32((void**)&buf_bias_bias, lt_bias_bias), err );
        primitives[BUFFER_BIAS_BIAS] = (long long)buf_bias_bias;
        primitives[CV_BIAS_BIAS]     = (long long)cv_bias_bias;

        //conversion for grad_out if necessary; default to gradOutput's CPU
        //buffer when no conversion is needed
        dnnLayout_t lt_bias_out = (dnnLayout_t)primitives[L_B_O];
        dnnPrimitive_t cv_out_bias = NULL;
        float* buf_out_bias = (float*)(gradOutput[CPUPtr]);
        CHECK_ERR( try_convert(&cv_out_bias, &buf_out_bias, lt_out, lt_bias_out), err );
        primitives[BUFFER_BIAS_OUT] = (long long)buf_out_bias;
        primitives[CV_BIAS_OUT]     = (long long)cv_out_bias;
    }
ERR_RETURN:
    return;
}
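/*
 * For reference, the try_convert pattern used above (a sketch under the
 * assumption that the helper, defined elsewhere in this file, follows the
 * usual MKL-DNN idiom; the real implementation may differ in detail). If
 * the two layouts differ, it creates a conversion primitive and allocates
 * a destination buffer; otherwise it leaves *buf pointing at the original
 * data and *cv set to NULL so callers can skip the conversion:
 *
 *   static dnnError_t try_convert_sketch(dnnPrimitive_t* cv, float** buf,
 *                                        dnnLayout_t from, dnnLayout_t to)
 *   {
 *       dnnError_t err = E_SUCCESS;
 *       *cv = NULL;
 *       if (!dnnLayoutCompare_F32(from, to))
 *       {
 *           err = dnnConversionCreate_F32(cv, from, to);
 *           if (err == E_SUCCESS)
 *               err = dnnAllocateBuffer_F32((void**)buf, to);
 *       }
 *       return err;
 *   }
 */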