Example #1
static void Init_b(long long * gradIn, long long * gradOut, long long * primitives)
{
    dnnError_t err;

    //gradOut layout: either the user (CPU) layout or an MKL layout from upstream
    dnnLayout_t lt_out_b     = (dnnLayout_t)primitives[POOL_L_B_O];
    dnnLayout_t lt_out       = (dnnLayout_t)gradOut[MKLLayout];
    if (lt_out==NULL) lt_out  = (dnnLayout_t)primitives[POOL_L_O];
    //create a conversion primitive and buffer if the layouts differ
    dnnPrimitive_t cv_out_b = NULL;    float * buf_out_b = NULL;
    CHECK_ERR( try_convert(&cv_out_b, &buf_out_b, lt_out, lt_out_b) , err );
    //save the conversion and buffer for the backward pass
    primitives[CV_POOLING_BACKWARD_OUTPUT]      = (long long)cv_out_b;
    primitives[BUFFER_POOLING_BACKWARD_OUTPUT]  = (long long)buf_out_b;

    //gradIn: record the user layout and allocate an MKL buffer for the result
    gradIn[CPULayout]   = primitives[POOL_L_I];
    dnnLayout_t lt_in_b = (dnnLayout_t)primitives[POOL_L_B_I];
    float* buf_in_b = NULL;
    CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buf_in_b), lt_in_b), err );
    primitives[BUFFER_POOLING_BACKWARD_INPUT] = (long long)buf_in_b;

ERR_RETURN:
    return;
}
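The try_convert helper used throughout these examples is not shown in the listing; the following is a minimal sketch under the assumption that it creates a conversion primitive and allocates a destination buffer only when the two layouts actually differ. It is hypothetical, not the project's actual code.

//hypothetical sketch of try_convert; the real helper may differ
static dnnError_t try_convert(
    dnnPrimitive_t* cv,   //out: conversion primitive, left NULL if layouts match
    float** buf,          //out: buffer in the 'to' layout, left unchanged if layouts match
    dnnLayout_t from,
    dnnLayout_t to)
{
    dnnError_t err = E_SUCCESS;
    if (!dnnLayoutCompare_F32(from, to))
    {
        CHECK_ERR( dnnConversionCreate_F32(cv, from, to), err );
        CHECK_ERR( dnnAllocateBuffer_F32((void**)buf, to), err );
    }
ERR_RETURN:
    return err;
}

Leaving *buf unchanged when the layouts match is consistent with call sites that pre-initialize the buffer pointer to the tensor's CPU data.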
Example #2
File: conv.c Project: StevenLOL/neon
//gradOut: output gradient of the conv layer (known)
//gradIn:  input gradient, to be computed
static void Conv_bdata_init(
  long long * gradIn,
  long long * gradOut,
  int N, int oC, int oH, int oW,
  long long * weight,
  long long * primitives)
{
    dnnError_t err;

    //get gradOut layout, create conversion if necessary
    dnnLayout_t lt_out  = (dnnLayout_t)gradOut[MKLLayout];
    if(lt_out==NULL)
    {
        lt_out  = (dnnLayout_t)primitives[L_O];
    }
    dnnPrimitive_t cv_out_bdata = NULL; float * buf_out_bdata = NULL;
    CHECK_ERR( try_convert(&cv_out_bdata, &buf_out_bdata, lt_out, (dnnLayout_t)primitives[L_BD_O]) , err );
    primitives[CONVERT_BWDDATA_OUTPUT] = (long long)cv_out_bdata;
    primitives[BUFFER_BWDDATA_OUTPUT]  = (long long)buf_out_bdata;

    //for filter
    dnnLayout_t lt_filter       = (dnnLayout_t)primitives[L_W];
    dnnLayout_t lt_filter_bdata = (dnnLayout_t)primitives[L_BD_W];
    dnnPrimitive_t cv_filter_bdata = NULL; float * buf_filter_bdata = NULL;
    CHECK_ERR( try_convert(&cv_filter_bdata, &buf_filter_bdata, lt_filter, lt_filter_bdata), err );
    primitives[BUFFER_BWDDATA_FILTER]  = (long long)buf_filter_bdata;
    primitives[CONVERT_BWDDATA_FILTER] = (long long)cv_filter_bdata;

    //create gradInput layout and memory
    dnnLayout_t lt_in_bdata = (dnnLayout_t)primitives[L_BD_I];
    float * buf_in_bdata = NULL;   //allocated below in the backward-data layout
    CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buf_in_bdata), lt_in_bdata), err );
    primitives[BUFFER_BWDDATA_INPUT] = (long long)buf_in_bdata;
    gradIn[CPULayout] = (long long)(dnnLayout_t)primitives[L_I_CHWN];

    //the first layer receives gradOut in the user layout; allocate a transpose buffer for it
    float* gradOutTransPtr = NULL;
    if (gradOut[MKLLayout] == 0)
    {
        gradOutTransPtr = (float*)malloc(N*oC*oH*oW*sizeof(float));
    }
    primitives[BUFFER_TRANS_OUTPUT] = (long long)gradOutTransPtr;

ERR_RETURN:
    return;
}
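These routines also depend on a CHECK_ERR macro paired with the ERR_RETURN label. A plausible definition is sketched below; it is an assumption, since the actual macro isn't shown in the listing.

//hypothetical definition of CHECK_ERR; the real macro may differ
#define CHECK_ERR(f, err)                                    \
    do {                                                     \
        (err) = (f);                                         \
        if ((err) != E_SUCCESS) {                            \
            fprintf(stderr, "[%s:%d] %s failed, err = %d\n", \
                    __FILE__, __LINE__, #f, (err));          \
            goto ERR_RETURN;                                 \
        }                                                    \
    } while (0)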
Example #3
void MaxPooling_bprop(
    unsigned long long gradOutput,  //input, N*outC*outH*outW
    unsigned long long gradInput,   //output result
    unsigned long long dnnprimitives,
    int initOK, const float beta)
{
    dnnError_t err;
    long long* primitives = (long long*)dnnprimitives;
    if (initOK == 0)
    {
        Init_b((long long *)gradInput, (long long *)gradOutput, primitives);
    }

    //gather resources for the backward primitive
    float* resPool[dnnResourceNumber] = {0};
    float* OutPtr = GetPtr(gradOutput);

    resPool[dnnResourceDiffSrc]   = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    resPool[dnnResourceDiffDst]   = OutPtr;
    resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE];

    //make conversion for gradOut if necessary
    dnnPrimitive_t cv_out_b = (dnnPrimitive_t)(primitives[CV_POOLING_BACKWARD_OUTPUT]);
    if (cv_out_b)
    {
        float* buf_out_b = (float*)primitives[BUFFER_POOLING_BACKWARD_OUTPUT];
        CHECK_ERR( dnnConversionExecute_F32(cv_out_b, OutPtr, buf_out_b), err );
        resPool[dnnResourceDiffDst] = buf_out_b;
    }

    //zero the gradInput buffer; the layout size is in bytes, hence the division by 4 for float
    long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)primitives[POOL_L_B_I]);
    float * tempPtr = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    #pragma omp parallel for
    for (long long i = 0; i < grad_in_len/4; ++i)
    {
        tempPtr[i] = 0;
    }

    CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_BACKWARD], (void**)resPool), err );

    if(beta != 0.0)
    {
        //need to accumulate the previous delta stored in gradInput
        long long* ptr_gradInput = (long long*)gradInput;
        float* pFirstBuf = GetPtr(gradInput);
        dnnLayout_t layout_pre_delta = (dnnLayout_t)ptr_gradInput[MKLLayout];
        if(layout_pre_delta == NULL) layout_pre_delta = (dnnLayout_t)primitives[POOL_L_I];
        dnnLayout_t layout_add_delta = (dnnLayout_t)primitives[POOL_L_B_I];
        float* temp_memory = NULL;
        if (!dnnLayoutCompare_F32(layout_add_delta, layout_pre_delta))
        {
            CHECK_ERR( dnnAllocateBuffer_F32((void**)&temp_memory, layout_add_delta) , err );
            dnnPrimitive_t cv = NULL;
            CHECK_ERR( dnnConversionCreate_F32(&cv, layout_pre_delta, layout_add_delta), err );
            CHECK_ERR( dnnConversionExecute_F32(cv, pFirstBuf, temp_memory), err );
            pFirstBuf = temp_memory;
        }
        long long len = (long long)dnnLayoutGetMemorySize_F32(layout_add_delta) / 4;   //bytes -> float count
        cblas_saxpy(len, 1.0, pFirstBuf, 1, (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1);
        if (temp_memory != NULL)
            dnnReleaseBuffer_F32(temp_memory);
    }

    ((long long *)gradInput)[MKLLayout] = primitives[POOL_L_B_I];
    ((long long *)gradInput)[MKLPtr]    = primitives[BUFFER_POOLING_BACKWARD_INPUT];

ERR_RETURN:
    return;
}
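GetPtr abstracts over the two buffers a tensor can carry in these examples. A minimal sketch, assuming the MKLPtr/CPUPtr slots seen above; the helper body is hypothetical.

//hypothetical sketch: prefer the MKL-layout buffer when the tensor has one,
//otherwise fall back to the plain CPU buffer
static float* GetPtr(unsigned long long tensor)
{
    long long* t = (long long*)tensor;
    if (t[MKLPtr] != 0)
        return (float*)t[MKLPtr];
    return (float*)t[CPUPtr];
}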
Example #4
//useMaxPooling: 1 for max pooling, otherwise average pooling
//useCaffe: use ceil mode for the pooling output dimensions
static void Init_f(
    long long * input,
    long long * output,
    long long * primitives,
    int N, int inC, int inH, int inW,
    int kH, int kW, int dH, int dW,
    int padH,int padW,
    int outC, int outH,int outW,
    int useMaxPooling,
    int useCaffe)
{

    dnnError_t err;

    //dimension
    size_t inputSize[DIM4]     = { inW,  inH,  inC, N};
    size_t outputSize[DIM4]    = {outW, outH, outC, N};
    //NCHW strides (unused in this path; the CHWN strides below are used)
    size_t inputStrides1[DIM4]  = {1,  inW,   inW*inH,    inW*inH*inC};
    size_t outputStrides1[DIM4] = {1, outW, outW*outH, outW*outH*outC};

    //CHWN
    size_t inputStrides[DIM4]  = {N,  N*inW,   N*inW*inH,    1};
    size_t outputStrides[DIM4] = {N, N*outW, N*outW*outH, 1};

    size_t kernelSize[2]       = {   kW,   kH};
    size_t kernelStride[2]     = {   dW,   dH};

    //derive the bottom/right padding implied by the output size;
    //if it matches the declared top/left padding, the padding is symmetric
    int padH2 = (outH-1)*dH + kH - inH - padH;
    int padW2 = (outW-1)*dW + kW - inW - padW;
    int symm = 0;
    if (padH2==padH && padW2==padW) symm = 1;
    if (padH2<0)    padH2 = 0;
    if (padW2<0)    padW2 = 0;

    int pad_dim4[DIM4]      = {-padW, -padH, -padW2, -padH2};
    int pad_dim2[DIM2]      = {-padW, -padH};
    int inputOffset[DIM2]   = {    0,    0};

    //create user layout
    dnnLayout_t lt_out = NULL, lt_in = NULL;
    CHECK_ERR( dnnLayoutCreate_F32(&lt_in,  DIM4,  inputSize,  inputStrides) , err );
    CHECK_ERR( dnnLayoutCreate_F32(&lt_out, DIM4, outputSize, outputStrides) , err );
    primitives[POOL_L_I] = (long long)lt_in;
    primitives[POOL_L_O] = (long long)lt_out;

    //use the input's MKL layout if present, otherwise fall back to the user layout
    dnnLayout_t lt_in_f = (dnnLayout_t)input[MKLLayout];
    if(lt_in_f==NULL)
    {
        lt_in_f = lt_in;
    }
    primitives[POOL_L_F_I] = (long long)lt_in_f;

    //create operation
    dnnPrimitive_t pool_f = NULL, pool_b = NULL;
    dnnPrimitiveAttributes_t attributes = NULL;
    CHECK_ERR( dnnPrimitiveAttributesCreate_F32(&attributes), err );

    dnnAlgorithm_t algo = (useMaxPooling==1) ? dnnAlgorithmPoolingMax : dnnAlgorithmPoolingAvg;
    if (useCaffe || symm)
    {
        CHECK_ERR( dnnPoolingCreateForward_F32 (&pool_f, attributes, algo, lt_in_f, kernelSize, kernelStride, pad_dim2, dnnBorderZeros), err );
        CHECK_ERR( dnnPoolingCreateBackward_F32(&pool_b, attributes, algo, lt_in_f, kernelSize, kernelStride, pad_dim2, dnnBorderZeros), err );
    }
    else
    {
        CHECK_ERR( dnnPoolingCreateForward_F32 (&pool_f, attributes, algo, lt_in_f, kernelSize, kernelStride, pad_dim4, dnnBorderZerosAsymm), err );
        CHECK_ERR( dnnPoolingCreateBackward_F32(&pool_b, attributes, algo, lt_in_f, kernelSize, kernelStride, pad_dim4, dnnBorderZerosAsymm), err );
    }
    primitives[POOLING_FORWARD]  = (long long)pool_f;
    primitives[POOLING_BACKWARD] = (long long)pool_b;

    //create MKL layouts for the forward output and the backward buffers (derived from the forward primitive)
    dnnLayout_t lt_out_f = NULL, lt_out_b = NULL, lt_in_b = NULL;
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_out_f, pool_f, dnnResourceDst),   err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_in_b,  pool_f, dnnResourceSrc),   err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_out_b, pool_f, dnnResourceDst),   err );
    primitives[POOL_L_F_O] = (long long)lt_out_f;
    primitives[POOL_L_B_I] = (long long)lt_in_b;
    primitives[POOL_L_B_O] = (long long)lt_out_b;

    //create the workspace buffer (records max locations for the backward pass)
    dnnLayout_t lt_space = NULL; float* buf_space = NULL;
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_space, pool_f, dnnResourceWorkspace), err );
    CHECK_ERR( dnnAllocateBuffer_F32((void**)&buf_space, lt_space) , err );
    primitives[BUFFER_POOLING_FORWARD_WORKSPACE] = (long long)buf_space;

    //record the user output layout and allocate the MKL output buffer
    output[CPULayout] = (long long)lt_out;
    float* buf_out_f = NULL;
    CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buf_out_f), lt_out_f), err );
    primitives[BUFFER_POOLING_FORWARD_OUTPUT] = (long long)buf_out_f;

ERR_RETURN:
    return;
}
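As a worked example of the padding check in Init_f (values assumed for illustration): with inH = 5, kH = 2, dH = 2, padH = 0 and a ceil-mode output of outH = 3, the implied bottom padding is padH2 = (3-1)*2 + 2 - 5 - 0 = 1, which differs from padH. With useCaffe == 0, symm therefore stays 0 and the dnnBorderZerosAsymm variants are created, giving one extra implicit zero row at the bottom.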
Example #5
static void THNN_(BatchNormalization_MKLDNN_init_forward)(
          THLongTensor *primitives,
          int N,
          int inC,
          int inH,
          int inW,
	  double eps)
{
	dnnError_t err;
	dnnPrimitive_t bn_forward = NULL;
	dnnPrimitive_t bn_backward = NULL;
	dnnPrimitive_t bn_bwd_scaleshift = NULL;

	size_t inputSize[dimension]    = {inW, inH, inC, N};
	size_t inputStrides[dimension] = {1, inW, inH * inW, inC * inH * inW};

	dnnLayout_t lt_user_input = NULL;

	if(primitives->storage->data[BN_LAYOUT_INPUT] == 0)
	{
		CHECK_ERR( dnnLayoutCreate_F32(&lt_user_input, dimension, inputSize, inputStrides) , err );
#if CONVERSION_LOG
		fprintf(stderr, "MKLDNN BN: no input layout passed in, created user layout\n");
#endif
	}
	else
	{
		lt_user_input = (dnnLayout_t)primitives->storage->data[BN_LAYOUT_INPUT];
#if CONVERSION_LOG
		fprintf(stderr, "MKLDNN BN: got input layout from upstream layer\n");
#endif
	}

	CHECK_ERR( dnnBatchNormalizationCreateForward_F32(&bn_forward, NULL, lt_user_input, eps), err );
	CHECK_ERR( dnnBatchNormalizationCreateBackwardData_F32(&bn_backward, NULL, lt_user_input, eps), err );
	CHECK_ERR( dnnBatchNormalizationCreateBackwardScaleShift_F32(&bn_bwd_scaleshift, NULL, lt_user_input, eps), err );
	

	dnnLayout_t lt_bn_forward_workspace, lt_bn_forward_scaleshift, lt_bn_forward_output, lt_bn_backward_input;
	real * buffer_forward_workspace = NULL; real * buffer_forward_scaleshift = NULL;
	real * buffer_forward_output = NULL; real * buffer_backward_input = NULL;
	CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_bn_forward_workspace, bn_forward, dnnResourceWorkspace), err );
	CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_bn_forward_output, bn_forward, dnnResourceDst), err );
	CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_bn_forward_scaleshift, bn_forward, dnnResourceScaleShift), err );
	CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_bn_backward_input, bn_backward, dnnResourceDiffSrc), err );
		

	CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buffer_forward_workspace), lt_bn_forward_workspace), err );
	CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buffer_forward_scaleshift), lt_bn_forward_scaleshift), err );
	//the output and backward-input buffers are allocated below only when the
	//primitive layout size differs from the plain dense size

	size_t size1 = dnnLayoutGetMemorySize_F32(lt_bn_forward_output);
	size_t size2 = inW*inH*inC*N*sizeof(float);
	if(size1 == size2)
	{
#if CONVERSION_LOG
		fprintf(stderr, "MKLDNN BN forward output layout match OK\n");
#endif
	}
	else
	{
		CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buffer_forward_output), lt_bn_forward_output), err );
		fprintf(stderr, "MKLDNN BN forward output layout match FAIL, size1 = %zu, size2 = %zu \n", size1, size2);
	}

	size1 = dnnLayoutGetMemorySize_F32(lt_bn_backward_input);
	if(size1 == size2)
	{
#if CONVERSION_LOG
		fprintf(stderr, "MKLDNN BN bwddata input layout match OK\n");
#endif
	}
	else
	{
		CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buffer_backward_input), lt_bn_backward_input), err );
		fprintf(stderr, "MKLDNN BN bwddata input layout match FAIL, size1 = %zu, size2 = %zu \n", size1, size2);
	}

	//save the primitives and buffers into the THLongTensor (as long long values)
	primitives->storage->data[BN_LAYOUT_FORWARD_OUTPUT] = (long long)lt_bn_forward_output;
	primitives->storage->data[BN_LAYOUT_BACKWARD_INPUT] = (long long)lt_bn_backward_input;

	primitives->storage->data[BN_FORWARD] = (long long)bn_forward;
	primitives->storage->data[BN_BACKWARD] = (long long)bn_backward;
	primitives->storage->data[BN_SCALESHIFT] = (long long)bn_bwd_scaleshift;
	primitives->storage->data[BUFFER_BN_FORWARD_WORKSPACE] = (long long)buffer_forward_workspace;
	primitives->storage->data[BUFFER_BN_FORWARD_SCALESHIFT] = (long long)buffer_forward_scaleshift;
	primitives->storage->data[BUFFER_BN_FORWARD_OUTPUT] = (long long)buffer_forward_output;
	primitives->storage->data[BUFFER_BN_BACKWARD_INPUT] = (long long)buffer_backward_input;
	primitives->storage->data[BUFFER_BN_BACKWARD_WORKSPACE] = (long long)buffer_forward_workspace;   //workspace is shared between forward and backward
}
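For context, executing the forward primitive saved above would look roughly like this. This is a sketch; inputPtr and the surrounding plumbing are assumptions for illustration.

	//hypothetical execution sketch for the saved BN forward primitive;
	//resource slots follow the dnnResourceType_t enum
	void* resources[dnnResourceNumber] = {0};
	resources[dnnResourceSrc]        = inputPtr;                 //assumed input buffer
	resources[dnnResourceDst]        = buffer_forward_output;    //or an in-place output buffer
	resources[dnnResourceWorkspace]  = buffer_forward_workspace;
	resources[dnnResourceScaleShift] = buffer_forward_scaleshift;
	CHECK_ERR( dnnExecute_F32(bn_forward, resources), err );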
Example #6
File: conv.c Project: StevenLOL/neon
static void Conv_bfilter_init(
  long long * input,
  long long * gradOutput,
  long long * gradWeight,
  long long * primitives,
  int N, int oC, int oH, int oW)
{
    dnnError_t err;

    //for gradOut
    dnnLayout_t lt_out = (dnnLayout_t)(gradOutput[MKLLayout]);
    if(lt_out==NULL) lt_out = (dnnLayout_t)primitives[L_O];
    dnnPrimitive_t cv_out_bfilter = NULL; float* buf_out_bfilter = NULL;
    CHECK_ERR( try_convert(&cv_out_bfilter, &buf_out_bfilter, lt_out, (dnnLayout_t)primitives[L_BF_O]) , err );
    primitives[CONVERT_BWDFILTER_OUTPUT] = (long long)cv_out_bfilter;
    primitives[BUFFER_BWDFILTER_OUTPUT]  = (long long)buf_out_bfilter;

    //for the first layer (no upstream MKL delta), gradOut arrives in the user
    //layout and must be transposed first; allocate the buffer once
    float* gradOutTransPtr = NULL;
    if ( gradOutput[MKLLayout] == 0 && primitives[BUFFER_TRANS_OUTPUT] == 0)
    {
        gradOutTransPtr = (float*)malloc(N*oC*oH*oW*sizeof(float));
        primitives[BUFFER_TRANS_OUTPUT] = (long long)gradOutTransPtr;
    }

    //for filter
    dnnLayout_t lt_filter         = (dnnLayout_t)primitives[L_W];
    dnnLayout_t lt_filter_bfilter = (dnnLayout_t)primitives[L_BF_W];
    dnnPrimitive_t cv_filter_bfilter = NULL; float * buf_filter_bfilter = NULL;
    if(!dnnLayoutCompare_F32(lt_filter_bfilter, lt_filter))
    {
        CHECK_ERR( dnnConversionCreate_F32(&cv_filter_bfilter, lt_filter_bfilter, lt_filter), err);
        CHECK_ERR( dnnAllocateBuffer_F32((void**)&buf_filter_bfilter, lt_filter_bfilter), err);
    }
    primitives[BUFFER_BWDFILTER_FILTER]  = (long long)buf_filter_bfilter;
    primitives[CONVERT_BWDFILTER_FILTER] = (long long)cv_filter_bfilter;

    //for input
    dnnLayout_t lt_in_real = (dnnLayout_t)input[MKLLayout];
    if(lt_in_real==NULL)
    {
        lt_in_real = (dnnLayout_t)primitives[L_I];
    }
    dnnLayout_t lt_in_bfilter = (dnnLayout_t)primitives[L_BF_I];
    dnnPrimitive_t cv_in_bfilter = NULL; float* buf_in_bfilter = (float*)(input[CPUPtr]);
    CHECK_ERR( try_convert(&cv_in_bfilter, &buf_in_bfilter, lt_in_real, lt_in_bfilter), err );
    primitives[BUFFER_BWDFILTER_INPUT]  = (long long)buf_in_bfilter;
    primitives[CONVERT_BWDFILTER_INPUT] = (long long)cv_in_bfilter;

    //if the layer has a bias term
    if (primitives[BDW_BIAS_INDEX] != 0)
    {
        //convert for grad_bias if necessary
        dnnLayout_t lt_bias_bias  = (dnnLayout_t)primitives[L_B_B];
        dnnLayout_t lt_bias       = (dnnLayout_t)primitives[L_B];
        dnnPrimitive_t cv_bias_bias = NULL; float * buf_bias_bias = NULL;
        CHECK_ERR( dnnConversionCreate_F32(&cv_bias_bias, lt_bias_bias, lt_bias), err);
        CHECK_ERR( dnnAllocateBuffer_F32((void**)&buf_bias_bias, lt_bias_bias), err);
        primitives[BUFFER_BIAS_BIAS]  = (long long)buf_bias_bias;
        primitives[CV_BIAS_BIAS] =      (long long)cv_bias_bias;

        //convert grad_out for the bias gradient if necessary
        dnnLayout_t lt_bias_out = (dnnLayout_t)primitives[L_B_O];
        dnnPrimitive_t cv_out_bias = NULL; float* buf_out_bias = (float*)(gradOutput[CPUPtr]);   //source operand is gradOut, not input
        CHECK_ERR( try_convert(&cv_out_bias, &buf_out_bias, lt_out, lt_bias_out), err );
        primitives[BUFFER_BIAS_OUT]  = (long long)buf_out_bias;
        primitives[CV_BIAS_OUT] = (long long)cv_out_bias;
    }


ERR_RETURN:
    return;
}
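BUFFER_TRANS_OUTPUT and BUFFER_TRANS_INPUT hold plain malloc'd buffers for transposing user-layout (CHWN) data into NCHW before conversion. A sketch of such a transpose follows; the project presumably has its own equivalent routine, so this is illustrative only.

//hypothetical CHWN -> NCHW transpose for the BUFFER_TRANS_* buffers
static void trans_CHWN_to_NCHW(const float* src, float* dst,
                               int N, int C, int H, int W)
{
    #pragma omp parallel for collapse(2)
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int h = 0; h < H; ++h)
                for (int w = 0; w < W; ++w)
                    //src index (CHWN): ((c*H + h)*W + w)*N + n
                    //dst index (NCHW): ((n*C + c)*H + h)*W + w
                    dst[((n*C + c)*H + h)*W + w] = src[((c*H + h)*W + w)*N + n];
}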
Example #7
File: conv.c Project: StevenLOL/neon
static int Conv_f_init(
  long long * input,
  long long * output,
  long long * weight,
  long long * primitives,
  int N, int inC, int inH, int inW,
  int kH, int kW,
  int dH, int dW,
  int padH, int padW,
  int outC, int outH, int outW,
  int hasBias)
{
    dnnError_t err;

    //init dimensions
    size_t inputSize[DIM4]  = { inW,  inH,  inC, N};
    size_t outputSize[DIM4] = {outW, outH, outC, N};
    size_t filterSize[DIM4] = {  kW,   kH,  inC, outC};
    size_t stride[DIM2]     = {  dW,   dH};
    int    pad[DIM2]        = {-padW, -padH};
    size_t biasSize[1]      = {outC};
    size_t biasStrides[1]   = { 1 };

    //using NCHW layout
    size_t filterStridesNCHW[DIM4] = {1,   kW,     kW*kH,      kW*kH*inC};
    size_t inputStridesNCHW[DIM4]  = {1,  inW,   inW*inH,    inW*inH*inC};
    size_t outputStridesNCHW[DIM4] = {1, outW, outW*outH, outW*outH*outC};

    //CHWN
    size_t filterStridesCHWN[DIM4] = {outC,  outC*kW,      outC*kW*kH,     1};
    size_t inputStridesCHWN[DIM4]  = {N,  N*inW,   N*inW*inH,    1};
    size_t outputStridesCHWN[DIM4] = {N,  N*outW,  N*outW*outH,  1};

    //create execute and save into primitives
    dnnPrimitiveAttributes_t attributes = NULL;
    CHECK_ERR( dnnPrimitiveAttributesCreate_F32(&attributes), err );
    dnnPrimitive_t conv_f       = NULL;    //forward operation
    dnnPrimitive_t conv_bdata   = NULL;    //backward: gradient w.r.t. input
    dnnPrimitive_t conv_bfilter = NULL;    //backward: gradient w.r.t. filter (weight)
    dnnPrimitive_t conv_b_bias  = NULL;    //backward: gradient w.r.t. bias

    //create layout and save
    //lt_in, layout of input in NCHW form
    //lt_filter_f, required layout (MKL layout) for forward for weight
    //lt_out_bfilter, required layout for backward weight update for output
    dnnLayout_t lt_in_NCHW,    lt_filter,         lt_out_NCHW,    lt_in_CHWN, lt_out_CHWN, lt_bias_CHWN = NULL;
    dnnLayout_t lt_in_f,       lt_filter_f,       lt_out_f,       lt_bias_f = NULL;
    dnnLayout_t lt_in_bdata,   lt_filter_bdata,   lt_out_bdata;
    dnnLayout_t lt_in_bfilter, lt_filter_bfilter, lt_out_bfilter, lt_bias_bias = NULL, lt_out_bias = NULL;

    if (hasBias)
    {
        CHECK_ERR( dnnConvolutionCreateForwardBias_F32(&conv_f, attributes, dnnAlgorithmConvolutionDirect, DIM4, inputSize, outputSize, filterSize, stride, pad, dnnBorderZeros), err );
        CHECK_ERR( dnnLayoutCreate_F32(&lt_bias_CHWN, 1, biasSize, biasStrides), err );
        CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_bias_f, conv_f, dnnResourceBias), err );
        CHECK_ERR( dnnConvolutionCreateBackwardBias_F32(&conv_b_bias, attributes, dnnAlgorithmConvolutionDirect, DIM4, outputSize), err );
        CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_bias_bias, conv_b_bias, dnnResourceDiffBias), err );
        CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_out_bias, conv_b_bias, dnnResourceDiffDst), err );
    }
    else
    {
        CHECK_ERR( dnnConvolutionCreateForward_F32(&conv_f, attributes, dnnAlgorithmConvolutionDirect, DIM4, inputSize, outputSize, filterSize, stride, pad, dnnBorderZeros), err );
    }
    CHECK_ERR( dnnConvolutionCreateBackwardData_F32(&conv_bdata, attributes, dnnAlgorithmConvolutionDirect, DIM4, inputSize, outputSize, filterSize, stride, pad, dnnBorderZeros), err );
    CHECK_ERR( dnnConvolutionCreateBackwardFilter_F32(&conv_bfilter, attributes, dnnAlgorithmConvolutionDirect, DIM4, inputSize, outputSize, filterSize, stride, pad, dnnBorderZeros), err );

    primitives[FORWARD_INDEX]    = (long long)conv_f;
    primitives[BWD_DATA_INDEX]   = (long long)conv_bdata;
    primitives[BWD_FILTER_INDEX] = (long long)conv_bfilter;
    primitives[BDW_BIAS_INDEX]   = (long long)conv_b_bias;

    CHECK_ERR( dnnLayoutCreate_F32(&lt_in_NCHW,  DIM4, inputSize,  inputStridesNCHW),  err );
    CHECK_ERR( dnnLayoutCreate_F32(&lt_in_CHWN,  DIM4, inputSize,  inputStridesCHWN),  err );
    CHECK_ERR( dnnLayoutCreate_F32(&lt_filter,   DIM4, filterSize, filterStridesCHWN), err );
    CHECK_ERR( dnnLayoutCreate_F32(&lt_out_NCHW, DIM4, outputSize, outputStridesNCHW), err );
    CHECK_ERR( dnnLayoutCreate_F32(&lt_out_CHWN, DIM4, outputSize, outputStridesCHWN), err );

    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_in_f,     conv_f, dnnResourceSrc   ), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_filter_f, conv_f, dnnResourceFilter), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_out_f,    conv_f, dnnResourceDst   ), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_in_bdata,     conv_bdata, dnnResourceDiffSrc), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_filter_bdata, conv_bdata, dnnResourceFilter),  err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_out_bdata,    conv_bdata, dnnResourceDiffDst), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_in_bfilter,     conv_bfilter, dnnResourceSrc),        err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_filter_bfilter, conv_bfilter, dnnResourceDiffFilter), err );
    CHECK_ERR( dnnLayoutCreateFromPrimitive_F32(&lt_out_bfilter,    conv_bfilter, dnnResourceDiffDst),    err );

    //layouts assume NCHW here (CHWN user data will be transposed)
    primitives[L_I]    = (long long)lt_in_NCHW;
    primitives[L_O]    = (long long)lt_out_NCHW;
    primitives[L_W]    = (long long)lt_filter;
    primitives[L_B]    = (long long)lt_bias_CHWN;
    primitives[L_F_I]  = (long long)lt_in_f;
    primitives[L_F_O]  = (long long)lt_out_f;
    primitives[L_F_W]  = (long long)lt_filter_f;
    primitives[L_F_B]  = (long long)lt_bias_f;
    primitives[L_BD_I] = (long long)lt_in_bdata;
    primitives[L_BD_O] = (long long)lt_out_bdata;
    primitives[L_BD_W] = (long long)lt_filter_bdata;
    primitives[L_BF_I] = (long long)lt_in_bfilter;
    primitives[L_BF_O] = (long long)lt_out_bfilter;
    primitives[L_BF_W] = (long long)lt_filter_bfilter;
    primitives[L_I_CHWN] = (long long)lt_in_CHWN;
    primitives[L_O_CHWN] = (long long)lt_out_CHWN;
    primitives[L_B_B]    = (long long)lt_bias_bias;
    primitives[L_B_O]    = (long long)lt_out_bias;
    //the input may carry a user layout (raw contiguous NCHW data)
    //or an MKL layout (the output of a previous MKL-based layer)
    dnnLayout_t lt_in_real = (dnnLayout_t)input[MKLLayout];
    if(lt_in_real==NULL) lt_in_real = lt_in_NCHW;
    //create conversion and buff if necessary
    dnnPrimitive_t cv_in_f = NULL; float * buf_in_f = NULL;
    CHECK_ERR( try_convert(&cv_in_f, &buf_in_f, lt_in_real, lt_in_f) , err );

    //allocate a transpose buffer if the input arrives in the user (CHWN) layout
    float* newPtr = NULL;
    if (input[MKLLayout] == 0)
    {
        newPtr = (float*)malloc(inC*inH*inW*N*sizeof(float));
    }
    primitives[BUFFER_TRANS_INPUT] = (long long)newPtr;

    //save conversion and buff
    primitives[BUFFER_FORWARD_INPUT]  = (long long)buf_in_f;
    primitives[CONVERT_FORWARD_INPUT] = (long long)cv_in_f;

    //filter layout
    dnnPrimitive_t cv_filter_f = NULL; float * buf_filter_f = NULL;
    CHECK_ERR( try_convert(&cv_filter_f, &buf_filter_f, lt_filter, lt_filter_f), err );
    primitives[CONVERT_FORWARD_FILTER] = (long long)cv_filter_f;
    primitives[BUFFER_FORWARD_FILTER]  = (long long)buf_filter_f;

    //save the user layout for the output, and create the MKL buffer;
    //the output always has an MKL buffer, recorded in the layer's primitives
    output[CPULayout] = (long long)lt_out_CHWN;
    float* buf_out_f = NULL;
    CHECK_ERR( dnnAllocateBuffer_F32((void**)(&buf_out_f), lt_out_f), err );
    primitives[BUFFER_FORWARD_OUTPUT] = (long long)buf_out_f;

    //for bias
    dnnPrimitive_t cv_bias_f = NULL; float * buf_bias_f = NULL;
    if (hasBias)
    {
        CHECK_ERR( try_convert(&cv_bias_f, &buf_bias_f, lt_bias_CHWN, lt_bias_f), err );
    }
    primitives[CONVERT_FORWARD_BIAS] = (long long)cv_bias_f;
    primitives[BUFFER_FORWARD_BIAS]  = (long long)buf_bias_f;

    return 0;

ERR_RETURN:

    return 1;
}
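A forward pass using the primitives saved by Conv_f_init would then follow the same pattern as MaxPooling_bprop above: run the saved conversions if present, then execute the primitive. A sketch, where inPtr and filterPtr are assumed data pointers:

//hypothetical forward execution using the saved conversions and buffers
float* resConv[dnnResourceNumber] = {0};
resConv[dnnResourceSrc]    = inPtr;       //assumed input data pointer
resConv[dnnResourceFilter] = filterPtr;   //assumed weight data pointer
resConv[dnnResourceDst]    = (float*)primitives[BUFFER_FORWARD_OUTPUT];

dnnPrimitive_t cv_in = (dnnPrimitive_t)primitives[CONVERT_FORWARD_INPUT];
if (cv_in)
{
    float* buf = (float*)primitives[BUFFER_FORWARD_INPUT];
    CHECK_ERR( dnnConversionExecute_F32(cv_in, inPtr, buf), err );
    resConv[dnnResourceSrc] = buf;
}
dnnPrimitive_t cv_w = (dnnPrimitive_t)primitives[CONVERT_FORWARD_FILTER];
if (cv_w)
{
    float* buf = (float*)primitives[BUFFER_FORWARD_FILTER];
    CHECK_ERR( dnnConversionExecute_F32(cv_w, filterPtr, buf), err );
    resConv[dnnResourceFilter] = buf;
}
CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[FORWARD_INDEX], (void**)resConv), err );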