Example #1
0
File: conv.c  Project: StevenLOL/neon
//gradOutput, conv output gradient, also as the input
//gradInput, to be calculated
void Conv_bwdData(
  unsigned long long gradOutput,
  unsigned long long gradInput,
  unsigned long long weight,
  unsigned long long dnnprimitives,
  int N, int oC, int oH, int oW,
  int initOk, float beta)
{
    dnnError_t err;
    long long * primitives = (long long *)dnnprimitives;

	if(initOk == 0)
	{
		Conv_bdata_init((long long *)gradInput,	(long long *)gradOutput, N, oC, oH, oW, (long long *)weight, primitives);
	}

    //get resource
    float* inPtr     = (float*)primitives[BUFFER_BWDDATA_INPUT];
	float* outPtr    = GetPtr(gradOutput);
	float* filterPtr = GetPtr(weight);

	float * resConv[dnnResourceNumber]={0};
	resConv[dnnResourceDiffSrc] = inPtr;
	resConv[dnnResourceFilter]  = filterPtr;

    //do transpose if necessary
    float* newPtr = (float*)primitives[BUFFER_TRANS_OUTPUT];
    if (newPtr!=NULL)
    {
		mkl_somatcopy('r', 't', oC*oH*oW, N, 1.0, outPtr, N, newPtr, oC*oH*oW);
        outPtr = newPtr;
    }
	resConv[dnnResourceDiffDst] = outPtr;

	//do conversion if necessary
	dnnPrimitive_t cv_out_bdata = (dnnPrimitive_t)primitives[CONVERT_BWDDATA_OUTPUT];
	if (cv_out_bdata)
	{
		float* buf_out_bdata = (float *)(primitives[BUFFER_BWDDATA_OUTPUT]);
		CHECK_ERR( dnnConversionExecute_F32(cv_out_bdata, outPtr, buf_out_bdata), err );
		resConv[dnnResourceDiffDst] = buf_out_bdata;
	}

	dnnPrimitive_t cv_filter_bdata = (dnnPrimitive_t)primitives[CONVERT_BWDDATA_FILTER];
	if (cv_filter_bdata)
	{
		float* buf_filter_bdata = (float *)(primitives[BUFFER_BWDDATA_FILTER]);
		CHECK_ERR( dnnConversionExecute_F32(cv_filter_bdata, filterPtr, buf_filter_bdata), err );
	    resConv[dnnResourceFilter] = buf_filter_bdata;
	}

	CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[BWD_DATA_INDEX], (void**)resConv),err);

    ((long long*)gradInput)[MKLLayout] = (long long)primitives[L_BD_I];
    ((long long*)gradInput)[MKLPtr]    = (long long)primitives[BUFFER_BWDDATA_INPUT];

ERR_RETURN:
    return;
}
Example #2
0
File: conv.c  Project: StevenLOL/neon
void ConvertToMKL(unsigned long long tensor_)
{
    long long * tensor = (long long *)tensor_;
    if (tensor[CPUPtr] == 0 )
    {
        printf("error to conver to MKL tensor!\n");
        return;
    }

	if (tensor[MKLLayout] == 0)    return;//do not need convert

	if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout],
			(dnnLayout_t)tensor[CPULayout]))
	{
		dnnError_t err; dnnPrimitive_t cv;
		CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[CPULayout],
		            (dnnLayout_t)tensor[MKLLayout]), err );
		CHECK_ERR( dnnConversionExecute_F32(cv, (float *)tensor[CPUPtr], (float *)tensor[MKLPtr]), err );
	}
	else
	{
	    memcpy((void*)tensor[MKLPtr], (void*)tensor[CPUPtr], dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout]));
    }
ERR_RETURN:
    return;
}
Example #3
0
File: conv.c  Project: StevenLOL/neon
//convert tensor in MKL layout back to Numpy NCHW layout
//if layout diff, do conversion, else, copy memory directly
void ConvertBack(unsigned long long tensor_, int N, int C, int H, int W)
{
    long long * tensor = (long long *)tensor_;
    if (tensor[CPUPtr] == 0 )
    {
        printf("error to converback tensor!\n");
        return;
    }

	if (tensor[MKLLayout] == 0)    return;//no MKL layout: nothing to convert
    dnnError_t err;
	//plain NCHW layout description used as conversion target
	size_t inSize[DIM4]   =   { W, H, C, N};
	size_t inStride[DIM4] =   { 1, W, W*H, W*H*C};
	dnnLayout_t lt_NCHW = NULL;       //note: unused lt_CHWN removed
	dnnPrimitive_t cv = NULL;
	float* newPtr = NULL;
	CHECK_ERR( dnnLayoutCreate_F32(&lt_NCHW, DIM4, inSize, inStride),  err );
	if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], (dnnLayout_t)tensor[CPULayout]))
	{
		float* cpuPtr = (float *)tensor[CPUPtr];
		float* mklPtr = (float *)tensor[MKLPtr];

		if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], lt_NCHW))
		{
            //MKL internal layout -> plain NCHW in a temporary buffer
            CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[MKLLayout],lt_NCHW), err );
            newPtr = (float*)malloc((size_t)N*C*H*W*sizeof(float));
            if (newPtr == NULL)   //fix: malloc result was used unchecked
            {
                printf("error to allocate buffer in ConvertBack!\n");
                goto ERR_RETURN;
            }
            CHECK_ERR( dnnConversionExecute_F32(cv, mklPtr, newPtr), err );
            mklPtr = newPtr;
		}
        //NCHW -> CHWN transpose into the user-visible CPU buffer
        mkl_somatcopy('r', 't', N, C*H*W, 1.0, mklPtr, C*H*W, cpuPtr, N);
	}
	else
	{
	    //same layout: parallel element copy (size is bytes, /4 for floats)
	    long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout]) ;
        float * destPtr = (float*)tensor[CPUPtr];
        float * srcPtr = (float*)tensor[MKLPtr];
        #pragma omp parallel for
        for (long long i = 0; i < grad_in_len/4; ++i)
        {
            destPtr[i] = srcPtr[i];
        }
    }
ERR_RETURN:
    //fix: cv and lt_NCHW used to leak on every call
    if (cv != NULL)      dnnDelete_F32(cv);
    if (lt_NCHW != NULL) dnnLayoutDelete_F32(lt_NCHW);
    if (newPtr != NULL)  free(newPtr);
}
Example #4
0
void MaxPooling_fprop(
    unsigned long long input,
    unsigned long long output,
    unsigned long long dnnprimitives,
    int initOk,
    int useMaxPooling,
    int N, int inC,
    int inH, int inW,
    int kH, int kW,
    int dH, int dW,
    int padH, int padW,
    int outC, int outH, int outW,
    int bCeil)
{
    dnnError_t err;
    long long* primitives = (long long*)dnnprimitives;

     if(initOk == 0)
    {
        Init_f((long long *)input, (long long *)output, primitives,N, inC, inH, inW, kH, kW, dH, dW, padH, padW, outC, outH, outW, useMaxPooling, bCeil);
    }

    //get resource
    float* resPool[dnnResourceNumber] = {0};
     float* input_data = GetPtr(input);
    resPool[dnnResourceSrc]       = input_data;
    resPool[dnnResourceDst]       = (float*)primitives[BUFFER_POOLING_FORWARD_OUTPUT];
    resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE];

    //do input conversion if necessary
    dnnPrimitive_t cv_in_f     = (dnnPrimitive_t)primitives[CV_POOLING_FORWARD_INPUT];
    if(cv_in_f)
    {
        float* buf_in_f     = (float*) (primitives[BUFFER_POOLING_FORWARD_INPUT]);
        CHECK_ERR( dnnConversionExecute_F32(cv_in_f, input_data, buf_in_f), err );
        resPool[dnnResourceSrc] = buf_in_f;
    }

    CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_FORWARD], (void**)resPool), err );

    ((long long*)output)[MKLPtr]    = primitives[BUFFER_POOLING_FORWARD_OUTPUT];
    ((long long*)output)[MKLLayout] = primitives[POOL_L_F_O];

ERR_RETURN:
    return;
}
Example #5
0
//Max/avg pooling backward pass.
//gradOutput is the incoming delta; the computed gradInput is written into
//the primitive-owned buffer and published via MKLLayout/MKLPtr.
//When beta != 0, the previous delta stored in gradInput is accumulated
//(axpy) into the freshly computed one, converting layouts first if needed.
void MaxPooling_bprop(
    unsigned long long gradOutput,  //input, N*outC*outH*outW
    unsigned long long gradInput,   //output result
    unsigned long long dnnprimitives,
    int initOK, const float beta)
{
    dnnError_t err;
    long long* primitives = (long long*)dnnprimitives;
    float* temp_memory = NULL;   //scratch for re-laid-out previous delta
    dnnPrimitive_t cv = NULL;    //conversion for previous delta
    if (initOK == 0)
    {
        Init_b((long long *)gradInput, (long long *)gradOutput, primitives);
    }

    //get resource
    float* resPool[dnnResourceNumber] = {0};
    float* OutPtr= GetPtr(gradOutput);

    resPool[dnnResourceDiffSrc]   = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    resPool[dnnResourceDiffDst]   = OutPtr;
    resPool[dnnResourceWorkspace] = (float*)primitives[BUFFER_POOLING_FORWARD_WORKSPACE];

    //make conversion for gradOutput if necessary
    dnnPrimitive_t cv_out_b = (dnnPrimitive_t)(primitives[CV_POOLING_BACKWARD_OUTPUT]);
    if (cv_out_b)
    {
        float* buf_out_b = (float*)primitives[BUFFER_POOLING_BACKWARD_OUTPUT];
        CHECK_ERR( dnnConversionExecute_F32(cv_out_b, OutPtr, buf_out_b), err );
        resPool[dnnResourceDiffDst] = buf_out_b;
    }

    //zero the gradInput buffer before executing (size is bytes, /4 for floats)
    long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)primitives[POOL_L_B_I]) ;
    float * tempPtr = (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT];
    #pragma omp parallel for
    for (long long i = 0; i < grad_in_len/4; ++i)
    {
        tempPtr[i] = 0;
    }

    CHECK_ERR( dnnExecute_F32((dnnPrimitive_t)primitives[POOLING_BACKWARD], (void**)resPool), err );

    if(beta != 0.0)
    {
        //require to add previous delta
        long long* ptr_gradInput = (long long*)gradInput;
        float* pFirstBuf = GetPtr(gradInput);
        dnnLayout_t layout_pre_delta = (dnnLayout_t)ptr_gradInput[MKLLayout];
        if(layout_pre_delta == NULL) layout_pre_delta = (dnnLayout_t)primitives[POOL_L_I];
        dnnLayout_t layout_add_delta = (dnnLayout_t)primitives[POOL_L_B_I];
        if (!dnnLayoutCompare_F32(layout_add_delta, layout_pre_delta))
        {
            //previous delta is in a different layout: convert it first
            CHECK_ERR( dnnAllocateBuffer_F32((void**)&temp_memory, layout_add_delta) , err );
            CHECK_ERR( dnnConversionCreate_F32(&cv, layout_pre_delta, layout_add_delta), err );
            CHECK_ERR( dnnConversionExecute_F32(cv, pFirstBuf, temp_memory), err );
            pFirstBuf = temp_memory;
        }
        //fix: was `long len`, which narrows on LP32 targets
        long long len = (long long)dnnLayoutGetMemorySize_F32(layout_add_delta) / 4 ;
        cblas_saxpy(len, 1.0, pFirstBuf, 1, (float*)primitives[BUFFER_POOLING_BACKWARD_INPUT], 1);
    }

    ((long long *)gradInput)[MKLLayout] = primitives[POOL_L_B_I];
    ((long long *)gradInput)[MKLPtr]    = primitives[BUFFER_POOLING_BACKWARD_INPUT];

ERR_RETURN:
    //fix: cv was never deleted, and temp_memory leaked whenever a CHECK_ERR
    //failure jumped here after the allocation
    if (cv != NULL)          dnnDelete_F32(cv);
    if (temp_memory != NULL) dnnReleaseBuffer_F32(temp_memory);
    return;
}
Example #6
0
//MKL-DNN batch-normalization backward.
//When gradInput is NULL, only gradWeight/gradBias are produced via the
//scale/shift backward primitive; otherwise the full backward primitive
//computes gradInput (converting gradOutput's layout first if needed).
void THNN_(BatchNormalization_MKLDNN_backward)(
  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
  THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
  THTensor *running_mean, THTensor *running_var,
  THTensor *save_mean, THTensor *save_std,
  bool train, double scale, double eps,
  THLongTensor *primitives,
          int initOk)
{
        struct timeval start,mid,end;
        gettimeofday(&start,NULL);    

	dnnError_t err;
	int inC = input->size[1];
	dnnPrimitive_t bn_backward 		= (dnnPrimitive_t)primitives->storage->data[BN_BACKWARD];
	dnnPrimitive_t bn_bwd_scaleshift 	= (dnnPrimitive_t)primitives->storage->data[BN_SCALESHIFT];
	real * buffer_forward_workspace 	= (real * )primitives->storage->data[BUFFER_BN_FORWARD_WORKSPACE];
	real * buffer_forward_scaleshift 	= (real * )primitives->storage->data[BUFFER_BN_FORWARD_SCALESHIFT];


	if(gradInput == 0)
	{
		//scale/shift-only backward: produces gradWeight/gradBias
		void* BatchNormScaleshift_res[dnnResourceNumber];
		BatchNormScaleshift_res[dnnResourceSrc] = THTensor_(data)(input);
		BatchNormScaleshift_res[dnnResourceDiffDst] = THTensor_(data)(gradOutput);
		//fix: gradInput is NULL in this branch, so THTensor_(data)(gradInput)
		//dereferenced a null tensor; the scale/shift primitive does not
		//produce DiffSrc here, so pass NULL instead
		BatchNormScaleshift_res[dnnResourceDiffSrc] = NULL;
		BatchNormScaleshift_res[dnnResourceWorkspace] = buffer_forward_workspace;
		BatchNormScaleshift_res[dnnResourceScaleShift] = buffer_forward_scaleshift;
		fprintf(stderr, "bn_bwd_scaleshift exec start \n");
		//fix: pointers were printed with 0x%x (wrong width on 64-bit); use %p
		fprintf(stderr, "BatchNormalization_MKLDNN_backward filter, input=%p,gradOutput=%p,workspace=%p,scaleshift=%p \n",
			(void*)THTensor_(data)(input), (void*)THTensor_(data)(gradOutput),
			(void*)buffer_forward_workspace, (void*)buffer_forward_scaleshift);
		CHECK_ERR( dnnExecute_F32(bn_bwd_scaleshift, (void*)BatchNormScaleshift_res), err );
		fprintf(stderr, "bn_bwd_scaleshift exec done \n");
		//scaleshift buffer holds [gradWeight[0..inC), gradBias[0..inC)]
		for(int i=0; i < inC; i++)
		{
			THTensor_(set1d)(gradWeight, i, buffer_forward_scaleshift[i]);
			THTensor_(set1d)(gradBias, i, buffer_forward_scaleshift[i+inC]);
		}
	}else
	{

		if(initOk == 0)
		{
			int N = gradOutput->size[0];
			int outC = gradOutput->size[1];
			int outH = gradOutput->size[2];
			int outW = gradOutput->size[3];

			primitives->storage->data[BN_LAYOUT_OUTPUT] = (long long)gradOutput->mkldnnLayout;
			THNN_(BatchNormalization_MKLDNN_init_backward)(primitives,N,outC,outH,outW,eps);
		}
		dnnPrimitive_t cv_backward_output = (dnnPrimitive_t) (primitives->storage->data[CV_BN_BACKWARD_OUTPUT]);

		real * buffer_backward_output = (real *) (primitives->storage->data[BUFFER_BN_BACKWARD_OUTPUT]);

		void* BatchNorm_res[dnnResourceNumber];
		BatchNorm_res[dnnResourceSrc] = THTensor_(data)(input);
		BatchNorm_res[dnnResourceDiffDst] = THTensor_(data)(gradOutput);
		BatchNorm_res[dnnResourceDiffSrc] = THTensor_(data)(gradInput);
		BatchNorm_res[dnnResourceWorkspace] = buffer_forward_workspace;
		BatchNorm_res[dnnResourceScaleShift] = buffer_forward_scaleshift;

		//convert gradOutput into the layout expected by bn_backward
		if(cv_backward_output)
		{
#if CONVERSION_LOG
			fprintf(stderr, "	BN backward output conversion... \n");
#endif
			BatchNorm_res[dnnResourceDiffDst] = buffer_backward_output;
			CHECK_ERR( dnnConversionExecute_F32(cv_backward_output, THTensor_(data)(gradOutput), BatchNorm_res[dnnResourceDiffDst]), err );

		}
                gettimeofday(&mid,NULL);

		CHECK_ERR( dnnExecute_F32(bn_backward, (void*)BatchNorm_res), err );
		//publish the MKL layout of the computed gradInput
		gradInput->mkldnnLayout = (long long)primitives->storage->data[BN_LAYOUT_BACKWARD_INPUT];

	}
 #if LOG_ENABLE
        gettimeofday(&end,NULL);
        double duration1 = (mid.tv_sec - start.tv_sec) * 1000 + (double)(mid.tv_usec - start.tv_usec) /1000;
        double duration2 = (end.tv_sec - mid.tv_sec) * 1000 + (double)(end.tv_usec - mid.tv_usec) /1000;
        fprintf(stderr,"        BatchNorm MKLDNN backward time1 = %.2f ms, time2 = %.2f ms \n",duration1,duration2);
#endif 
}
Example #7
0
File: conv.c  Project: StevenLOL/neon
//Convolution backward-filter (and optional backward-bias) pass.
//input:      forward input tensor handle
//gradOutput: incoming delta
//gradWeight: filter gradient destination
//gradBias:   bias gradient destination, 0 when the layer has no bias
//has_delta:  non-zero when gradOutput already carries a transposed delta,
//            so the CHWN->NCHW transpose is skipped
void Conv_bwdFilter(
  unsigned long long input,
  unsigned long long gradOutput,
  unsigned long long gradWeight,
  unsigned long long gradBias,
  unsigned long long dnnprimitives,
  int N, int oC, int oH, int oW,
  int initOk,
  int has_delta)
{
    dnnError_t err;
    long long * primitives = (long long * )dnnprimitives;
	if (initOk == 0)
	{
		Conv_bfilter_init((long long *)input,(long long *)gradOutput,(long long *)gradWeight, primitives, N,  oC, oH, oW);
	}

	float * inPtr     = GetPtr(input);
	float * filterPtr = GetPtr(gradWeight);
	float * outPtr    = GetPtr(gradOutput);

	float * resConv[dnnResourceNumber]={0};
	float * resBias[dnnResourceNumber]={0};
	resConv[dnnResourceDiffFilter] = filterPtr;

    //use the pre-transposed input buffer when it exists
    float* newInputPtr = (float*)primitives[BUFFER_TRANS_INPUT];
	if (newInputPtr != NULL)
	{
	    inPtr = newInputPtr;
	}
    resConv[dnnResourceSrc] = inPtr;

	//do input conversion if necessary
	dnnPrimitive_t cv_in_bfilter = (dnnPrimitive_t)primitives[CONVERT_BWDFILTER_INPUT];
	if (cv_in_bfilter)
	{
		float* buf_in_bfilter = (float *)(primitives[BUFFER_BWDFILTER_INPUT]);
		CHECK_ERR( dnnConversionExecute_F32(cv_in_bfilter, inPtr, buf_in_bfilter), err );
	    resConv[dnnResourceSrc] = buf_in_bfilter;
	}

    //for gradout in cpu layout
    float* newGradOutPtr = (float*)primitives[BUFFER_TRANS_OUTPUT];
	if (newGradOutPtr != NULL)
	{
	    if (!has_delta)   //for the first layer without delta
	    {
	    	mkl_somatcopy('r', 't', oC*oH*oW, N, 1.0, outPtr, N, newGradOutPtr, oC*oH*oW);
	    }
	    outPtr = newGradOutPtr;//use transposed NCHW layout
	}
    resConv[dnnResourceDiffDst] = outPtr;

	//do gradOutput conversion if necessary
	dnnPrimitive_t cv_out_bfilter = (dnnPrimitive_t)primitives[CONVERT_BWDFILTER_OUTPUT];
	if (cv_out_bfilter)
	{
        float* buf_out_bfilter = (float *)(primitives[BUFFER_BWDFILTER_OUTPUT]);
		CHECK_ERR( dnnConversionExecute_F32(cv_out_bfilter, outPtr, buf_out_bfilter), err );
	    resConv[dnnResourceDiffDst] = buf_out_bfilter;
	    resBias[dnnResourceDiffDst] = buf_out_bfilter;
	}

	//if a filter conversion exists, compute into its buffer and convert
	//back into filterPtr after execution
	dnnPrimitive_t cv_filter_bfilter = (dnnPrimitive_t)primitives[CONVERT_BWDFILTER_FILTER];
	float* buf_filter_bfilter = (float *)(primitives[BUFFER_BWDFILTER_FILTER]);
	if (cv_filter_bfilter)
	{
	    resConv[dnnResourceDiffFilter] = buf_filter_bfilter;
	}

	CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[BWD_FILTER_INDEX],
	                        (void**)resConv), err);

    //bias
    if (gradBias != 0)
    {
        float * biasPtr = GetPtr(gradBias);
        dnnPrimitive_t cv_bias_bias = (dnnPrimitive_t)primitives[CV_BIAS_BIAS];
        resBias[dnnResourceDiffBias] = biasPtr;
        if (cv_bias_bias)
        {
            resBias[dnnResourceDiffBias] = (float*)primitives[BUFFER_BIAS_BIAS];
        }

	    resBias[dnnResourceDiffDst] = outPtr;
	    dnnPrimitive_t cv_out_bias = (dnnPrimitive_t)primitives[CV_BIAS_OUT];
	    if (cv_out_bias)
	    {
	     	float* buf_out_bias = (float*)primitives[BUFFER_BIAS_OUT];
	        CHECK_ERR( dnnConversionExecute_F32(cv_out_bias, outPtr, buf_out_bias), err );
	        //fix: previously re-assigned outPtr here, discarding the
	        //conversion result; the converted buffer must be used
	        resBias[dnnResourceDiffDst] = buf_out_bias;
	    }

	    CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[BDW_BIAS_INDEX], (void**)resBias), err);

        //convert the bias gradient back to the user layout if needed
        if (cv_bias_bias)
        {
            CHECK_ERR( dnnConversionExecute_F32(cv_bias_bias,resBias[dnnResourceDiffBias], biasPtr), err );
        }
    }

    //do gradWeight conversion if necessary
    if (cv_filter_bfilter)
    {
       CHECK_ERR( dnnConversionExecute_F32(cv_filter_bfilter, buf_filter_bfilter, filterPtr), err );
    }
ERR_RETURN:
    return;
}
Example #8
0
File: conv.c  Project: StevenLOL/neon
int Conv_forward(
  unsigned long long input,
  unsigned long long output,
  unsigned long long weight,
  unsigned long long bias,
  unsigned long long dnnprimitives,
  int initOk,
  int N,  int inC, int inH, int inW,
  int kH, int kW,
  int dH,  int dW,
  int padH, int padW,
  int outC, int outH, int outW)
{
	dnnError_t err;
	long long * primitives = (long long*)dnnprimitives;
	if(initOk == 0)
	{
		int hasBias = 1;
        if (bias == 0) hasBias = 0;
		//for the first time, initialize layout and conversion
		int res = Conv_f_init((long long *)input,	(long long *)output, (long long *)weight, primitives,
		            N, inC, inH, inW, kH, kW, dH, dW, padH, padW, outC, outH, outW, hasBias);
	    if(res)
	    {
	        return 1;
	    }
	}

	//get memory as resource
	float* resConv[dnnResourceNumber]={0};
	float* outPtr    = (float*)primitives[BUFFER_FORWARD_OUTPUT];
	float* filterPtr = GetPtr(weight);
    float* biasPtr   = NULL;

	resConv[dnnResourceFilter] = filterPtr;
	resConv[dnnResourceDst]    = outPtr;
	float* inPtr = GetPtr(input);
	if(bias != 0) resConv[dnnResourceBias] = GetPtr(bias);

	//do conversion for input if necessary
	float* newPtr = (float*)primitives[BUFFER_TRANS_INPUT];
	if( newPtr != NULL)
	{
		mkl_somatcopy('r', 't', inC*inH*inW, N, 1.0, inPtr, N, newPtr, inC*inH*inW);
        inPtr = newPtr;
    }
	resConv[dnnResourceSrc]  = inPtr;

	dnnPrimitive_t cv_in_f = (dnnPrimitive_t)primitives[CONVERT_FORWARD_INPUT];
    if(cv_in_f)
	{
		//if no MKL layout, first transpose CHWN into NCHW
		float* buf_in_f  = (float *)(primitives[BUFFER_FORWARD_INPUT]);
		CHECK_ERR( dnnConversionExecute_F32(cv_in_f, inPtr, buf_in_f), err );
	    resConv[dnnResourceSrc] = buf_in_f;
	}

	//do conversion for filter if necessary
	dnnPrimitive_t cv_filter_f = (dnnPrimitive_t)primitives[CONVERT_FORWARD_FILTER];
	if(cv_filter_f)
	{
		float* buf_filter_f = (float *)(primitives[BUFFER_FORWARD_FILTER]);
		CHECK_ERR( dnnConversionExecute_F32(cv_filter_f, filterPtr, buf_filter_f), err );
        resConv[dnnResourceFilter] = buf_filter_f;
    }

    dnnPrimitive_t cv_bias_f  = (dnnPrimitive_t)primitives[CONVERT_FORWARD_BIAS];

    if (cv_bias_f)
    {
        biasPtr = GetPtr(bias);
        float* buf_bias_f = (float *)primitives[BUFFER_FORWARD_BIAS];
        CHECK_ERR( dnnConversionExecute_F32(cv_bias_f, biasPtr, buf_bias_f), err );
        resConv[dnnResourceBias] = buf_bias_f;
    }

    //real execute operation
	CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[FORWARD_INDEX],(void**)resConv),err);

    //always fill in MKL information for output
	((long long*)output)[MKLPtr]    = primitives[BUFFER_FORWARD_OUTPUT];
    ((long long*)output)[MKLLayout] = (long long)primitives[L_F_O];

    return 0;

ERR_RETURN:
    return 1;
}