void ConvertToMKL(unsigned long long tensor_) { long long * tensor = (long long *)tensor_; if (tensor[CPUPtr] == 0 ) { printf("error to conver to MKL tensor!\n"); return; } if (tensor[MKLLayout] == 0) return;//do not need convert if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], (dnnLayout_t)tensor[CPULayout])) { dnnError_t err; dnnPrimitive_t cv; CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[CPULayout], (dnnLayout_t)tensor[MKLLayout]), err ); CHECK_ERR( dnnConversionExecute_F32(cv, (float *)tensor[CPUPtr], (float *)tensor[MKLPtr]), err ); } else { memcpy((void*)tensor[MKLPtr], (void*)tensor[CPUPtr], dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout])); } ERR_RETURN: return; }
//convert tensor in MKL layout back to Numpy NCHW layout //if layout diff, do conversion, else, copy memory directly void ConvertBack(unsigned long long tensor_, int N, int C, int H, int W) { long long * tensor = (long long *)tensor_; if (tensor[CPUPtr] == 0 ) { printf("error to converback tensor!\n"); return; } if (tensor[MKLLayout] == 0) return;//do not need convert dnnError_t err; size_t inSize[DIM4] = { W, H, C, N}; size_t inStride[DIM4] = { 1, W, W*H, W*H*C}; dnnLayout_t lt_NCHW = NULL, lt_CHWN = NULL; float* newPtr = NULL; CHECK_ERR( dnnLayoutCreate_F32(<_NCHW, DIM4, inSize, inStride), err ); if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], (dnnLayout_t)tensor[CPULayout])) { float* cpuPtr = (float *)tensor[CPUPtr]; float* mklPtr = (float *)tensor[MKLPtr]; if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], lt_NCHW)) { dnnPrimitive_t cv; CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[MKLLayout],lt_NCHW), err ); newPtr = (float*)malloc(N*C*H*W*sizeof(float)); CHECK_ERR( dnnConversionExecute_F32(cv, mklPtr, newPtr), err ); mklPtr = newPtr; } mkl_somatcopy('r', 't', N, C*H*W, 1.0, mklPtr, C*H*W, cpuPtr, N); } else { long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout]) ; float * destPtr = (float*)tensor[CPUPtr]; float * srcPtr = (float*)tensor[MKLPtr]; #pragma omp parallel for for (long long i = 0; i < grad_in_len/4; ++i) { destPtr[i] = srcPtr[i]; } } ERR_RETURN: if (newPtr!=NULL) { free(newPtr); } }
void MaxPooling_bprop( unsigned long long gradOutput, //input, N*outC*outH*outW unsigned long long gradInput, //output result unsigned long long dnnprimitives, int initOK, const float beta) { dnnError_t err; long long* primitives = (long long*)dnnprimitives; if (initOK == 0) { Init_b((long long *)gradInput, (long long *)gradOutput, primitives); } //get resource float* resPool[dnnResourceNumber] = {0}; float* OutPtr= GetPtr(gradOutput); resPool[dnnResourceDiffSrc] = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT]; resPool[dnnResourceDiffDst] = OutPtr; resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE]; //make conversion for gradeOut if necessary dnnPrimitive_t cv_out_b = (dnnPrimitive_t)(primitives[CV_POOLING_BACKWARD_OUTPUT]); if (cv_out_b) { float* buf_out_b = (float*)primitives[BUFFER_POOLING_BACKWARD_OUTPUT]; CHECK_ERR( dnnConversionExecute_F32(cv_out_b, OutPtr, buf_out_b), err ); resPool[dnnResourceDiffDst] = buf_out_b; } long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)primitives[POOL_L_B_I]) ; float * tempPtr = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT]; #pragma omp parallel for for (long long i = 0; i < grad_in_len/4; ++i) { tempPtr[i] = 0; } CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_BACKWARD], (void**)resPool), err ); if(beta != 0.0) { //require to add previous delta long long* ptr_gradInput = (long long*)gradInput; float* pFirstBuf = GetPtr(gradInput); dnnLayout_t layout_pre_delta = (dnnLayout_t)ptr_gradInput[MKLLayout]; if(layout_pre_delta == NULL) layout_pre_delta = (dnnLayout_t)primitives[POOL_L_I]; dnnLayout_t layout_add_delta = (dnnLayout_t)primitives[POOL_L_B_I]; float* temp_memory = NULL; if (!dnnLayoutCompare_F32(layout_add_delta, layout_pre_delta)) { CHECK_ERR( dnnAllocateBuffer_F32((void**)&temp_memory, layout_add_delta) , err ); dnnPrimitive_t cv = NULL; CHECK_ERR( dnnConversionCreate_F32(&cv, layout_pre_delta, layout_add_delta), err ); CHECK_ERR( 
dnnConversionExecute_F32(cv, pFirstBuf, temp_memory), err ); pFirstBuf = temp_memory; } long len = (long long)dnnLayoutGetMemorySize_F32(layout_add_delta) / 4 ; cblas_saxpy(len, 1.0, pFirstBuf, 1, (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1); if (temp_memory != NULL) dnnReleaseBuffer_F32(temp_memory); } ((long long *)gradInput)[MKLLayout] = primitives[POOL_L_B_I]; ((long long *)gradInput)[MKLPtr] = primitives[BUFFER_POOLING_BACKWARD_INPUT]; ERR_RETURN: return; }
// One-time setup for MKL-DNN batch normalization on a N x inC x inH x inW
// input: creates the forward / backward-data / backward-scale-shift
// primitives, allocates the workspace and scale-shift buffers (plus output
// and backward-input buffers when their MKL layouts do not match the plain
// layout size), and stashes every handle in `primitives` (a THLongTensor
// used as a long long scratch array).
static void THNN_(BatchNormalization_MKLDNN_init_forward)(
          THLongTensor *primitives,
          int N,
          int inC,
          int inH,
          int inW,
          double eps)
{
    dnnError_t err;
    dnnPrimitive_t bn_forward = NULL;
    dnnPrimitive_t bn_backward = NULL;
    dnnPrimitive_t bn_bwd_scaleshift = NULL;

    // MKL dimension order is reversed: {W, H, C, N} with NCHW strides.
    size_t inputSize[dimension] = { inW, inH, inC, N };
    size_t inputStrides[dimension] = { 1, inW, inH * inW, inC * inH * inW };

    dnnLayout_t lt_user_input = NULL;
    int created_user_layout = 0; // we own lt_user_input only if we built it here
    if (primitives->storage->data[BN_LAYOUT_INPUT] == 0) {
        // No layout handed over from the previous layer: build a plain one.
        // Fix: the first argument was garbled ("<_user_input"); it must be
        // the address of the layout handle.
        CHECK_ERR( dnnLayoutCreate_F32(&lt_user_input, dimension,
                                       inputSize, inputStrides), err );
        created_user_layout = 1;
#if CONVERSION_LOG
        fprintf(stderr ,"MKLDNN BN get input layout FAIL......\n");
#endif
    } else {
        lt_user_input = (dnnLayout_t)primitives->storage->data[BN_LAYOUT_INPUT];
#if CONVERSION_LOG
        fprintf(stderr ,"MKLDNN BN get input layout OK\n");
#endif
    }

    CHECK_ERR( dnnBatchNormalizationCreateForward_F32(
                   &bn_forward, NULL, lt_user_input, eps), err );
    CHECK_ERR( dnnBatchNormalizationCreateBackwardData_F32(
                   &bn_backward, NULL, lt_user_input, eps), err );
    CHECK_ERR( dnnBatchNormalizationCreateBackwardScaleShift_F32(
                   &bn_bwd_scaleshift, NULL, lt_user_input, eps), err );

    dnnLayout_t lt_bn_forward_workspace = NULL;
    dnnLayout_t lt_bn_forward_scaleshift = NULL;
    dnnLayout_t lt_bn_forward_output = NULL;
    dnnLayout_t lt_bn_backward_input = NULL;
    real *buffer_forward_workspace = NULL;
    real *buffer_forward_scaleshift = NULL;
    real *buffer_forward_output = NULL;
    real *buffer_backward_input = NULL;

    // Fix: all four first arguments were garbled ("<_bn_..."); they must be
    // addresses of the layout handles.  Also check the return codes.
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(
                   &lt_bn_forward_workspace, bn_forward, dnnResourceWorkspace), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(
                   &lt_bn_forward_output, bn_forward, dnnResourceDst), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(
                   &lt_bn_forward_scaleshift, bn_forward, dnnResourceScaleShift), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(
                   &lt_bn_backward_input, bn_backward, dnnResourceDiffSrc), err );

    CHECK_ERR( dnnAllocateBuffer_F32((void **)(&buffer_forward_workspace),
                                     lt_bn_forward_workspace), err );
    CHECK_ERR( dnnAllocateBuffer_F32((void **)(&buffer_forward_scaleshift),
                                     lt_bn_forward_scaleshift), err );

    // Allocate an internal output buffer only when the MKL layout's size
    // differs from the plain NCHW size (otherwise the user buffer is reused).
    int size1 = dnnLayoutGetMemorySize_F32(lt_bn_forward_output);
    int size2 = inW * inH * inC * N * 4; // bytes; NOTE(review): may overflow int for huge tensors
    if (size1 == size2) {
#if CONVERSION_LOG
        fprintf(stderr ,"MKLDNN BN forward ouput layout match OK\n");
#endif
    } else {
        CHECK_ERR( dnnAllocateBuffer_F32((void **)(&buffer_forward_output),
                                         lt_bn_forward_output), err );
        fprintf(stderr ,"MKLDNN BN forward ouput layout match FAIL, size1 = %d, size2 = %d \n", size1, size2);
    }

    // Same check for the backward-data input buffer.
    size1 = dnnLayoutGetMemorySize_F32(lt_bn_backward_input);
    if (size1 == size2) {
#if CONVERSION_LOG
        fprintf(stderr ,"MKLDNN MaxPooling bwddata input layout match OK\n");
#endif
    } else {
        CHECK_ERR( dnnAllocateBuffer_F32((void **)(&buffer_backward_input),
                                         lt_bn_backward_input), err );
        fprintf(stderr ,"MKLDNN MaxPooling bwddata input layout match FAIL, size1 = %d, size2 = %d \n", size1, size2);
    }

    // Save the dnn handles into the THTensor (long long array).
    primitives->storage->data[BN_LAYOUT_FORWARD_OUTPUT] = (long long)lt_bn_forward_output;
    primitives->storage->data[BN_LAYOUT_BACKWARD_INPUT] = (long long)lt_bn_backward_input;
    primitives->storage->data[BN_FORWARD]   = (long long)bn_forward;
    primitives->storage->data[BN_BACKWARD]  = (long long)bn_backward;
    primitives->storage->data[BN_SCALESHIFT] = (long long)bn_bwd_scaleshift;
    primitives->storage->data[BUFFER_BN_FORWARD_WORKSPACE]  = (long long)buffer_forward_workspace;
    primitives->storage->data[BUFFER_BN_FORWARD_SCALESHIFT] = (long long)buffer_forward_scaleshift;
    primitives->storage->data[BUFFER_BN_FORWARD_OUTPUT]     = (long long)buffer_forward_output;
    primitives->storage->data[BUFFER_BN_BACKWARD_INPUT]     = (long long)buffer_backward_input;
    primitives->storage->data[BUFFER_BN_BACKWARD_WORKSPACE] = (long long)buffer_forward_workspace;

ERR_RETURN:
    // Fix: layouts that are not stored in `primitives` were leaked.
    if (lt_bn_forward_workspace != NULL) {
        dnnLayoutDelete_F32(lt_bn_forward_workspace);
    }
    if (lt_bn_forward_scaleshift != NULL) {
        dnnLayoutDelete_F32(lt_bn_forward_scaleshift);
    }
    if (created_user_layout && lt_user_input != NULL) {
        dnnLayoutDelete_F32(lt_user_input);
    }
}