//gradOutput, conv output gradient, also as the input //gradInput, to be calculated void Conv_bwdData( unsigned long long gradOutput, unsigned long long gradInput, unsigned long long weight, unsigned long long dnnprimitives, int N, int oC, int oH, int oW, int initOk, float beta) { dnnError_t err; long long * primitives = (long long *)dnnprimitives; if(initOk == 0) { Conv_bdata_init((long long *)gradInput, (long long *)gradOutput, N, oC, oH, oW, (long long *)weight, primitives); } //get resource float* inPtr = (float*)primitives[BUFFER_BWDDATA_INPUT]; float* outPtr = GetPtr(gradOutput); float* filterPtr = GetPtr(weight); float * resConv[dnnResourceNumber]={0}; resConv[dnnResourceDiffSrc] = inPtr; resConv[dnnResourceFilter] = filterPtr; //do transpose if necessary float* newPtr = (float*)primitives[BUFFER_TRANS_OUTPUT]; if (newPtr!=NULL) { mkl_somatcopy('r', 't', oC*oH*oW, N, 1.0, outPtr, N, newPtr, oC*oH*oW); outPtr = newPtr; } resConv[dnnResourceDiffDst] = outPtr; //do conversion if necessary dnnPrimitive_t cv_out_bdata = (dnnPrimitive_t)primitives[CONVERT_BWDDATA_OUTPUT]; if (cv_out_bdata) { float* buf_out_bdata = (float *)(primitives[BUFFER_BWDDATA_OUTPUT]); CHECK_ERR( dnnConversionExecute_F32(cv_out_bdata, outPtr, buf_out_bdata), err ); resConv[dnnResourceDiffDst] = buf_out_bdata; } dnnPrimitive_t cv_filter_bdata = (dnnPrimitive_t)primitives[CONVERT_BWDDATA_FILTER]; if (cv_filter_bdata) { float* buf_filter_bdata = (float *)(primitives[BUFFER_BWDDATA_FILTER]); CHECK_ERR( dnnConversionExecute_F32(cv_filter_bdata, filterPtr, buf_filter_bdata), err ); resConv[dnnResourceFilter] = buf_filter_bdata; } CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[BWD_DATA_INDEX], (void**)resConv),err); ((long long*)gradInput)[MKLLayout] = (long long)primitives[L_BD_I]; ((long long*)gradInput)[MKLPtr] = (long long)primitives[BUFFER_BWDDATA_INPUT]; ERR_RETURN: return; }
// Transpose a row-major m x n float matrix at address `input` into the
// n x m matrix at address `output` (both passed as raw 64-bit addresses).
void MatTrans(
    unsigned long long input,
    unsigned long long output,
    unsigned long long m,
    unsigned long long n)
{
    float * src = (float*)input;
    float * dst = (float*)output;
    // row-major out-of-place transpose: source lda = n, destination ldb = m
    mkl_somatcopy('r', 't', m, n, 1.0, src, n, dst, m);
}
/*
 * JNI entry: batched 2-D permute. Each of the K contiguous M x N
 * column-major slabs of j_A is transposed into the corresponding
 * N x M slab of j_B.
 */
JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_spermute
(JNIEnv * env, jobject calling_obj, jint M, jint N, jint K, jfloatArray j_A, jfloatArray j_B)
{
    const int slab = M * N;                 /* elements per matrix */
    jfloat * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
    jfloat * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
    int k;
    for (k = 0; k < K; k++) {
        /* column-major transpose of slab k */
        mkl_somatcopy('C', 'T', M, N, 1.0f, A + k * slab, M, B + k * slab, N);
    }
    /* release in reverse order of acquisition */
    (*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
    (*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
}
// JNI entry: out-of-place copy/transpose of a 32-bit int matrix
// (j_A, lda) -> (j_B, ldb). order and transA are single-character MKL
// flags ('r'/'c' and 'n'/'t') passed as Java strings.
// NOTE(review): the payload is jint but it is routed through the float
// routine mkl_somatcopy with alpha = 1.0f. Int bit patterns that alias
// float NaNs or denormals may not survive the multiply unchanged (NaN
// quieting, FTZ/DAZ) - confirm this is acceptable for the data involved,
// or switch to a plain 32-bit copy/transpose.
JNIEXPORT void JNICALL Java_edu_berkeley_bid_CBLAS_iomatcopy
(JNIEnv * env, jobject calling_obj, jstring j_order, jstring j_transA,
 jint M, jint N, jintArray j_A, jint lda, jintArray j_B, jint ldb)
{
    char * order = (char *)(*env)->GetStringUTFChars(env, j_order, 0);
    char * transA = (char *)(*env)->GetStringUTFChars(env, j_transA, 0);
    jfloat * A = (*env)->GetPrimitiveArrayCritical(env, j_A, JNI_FALSE);
    jfloat * B = (*env)->GetPrimitiveArrayCritical(env, j_B, JNI_FALSE);
    mkl_somatcopy(order[0], transA[0], M, N, 1.0f, A, lda, B, ldb);
    //release everything in reverse order of acquisition
    (*env)->ReleasePrimitiveArrayCritical(env, j_B, B, 0);
    (*env)->ReleasePrimitiveArrayCritical(env, j_A, A, 0);
    (*env)->ReleaseStringUTFChars(env, j_transA, transA);
    (*env)->ReleaseStringUTFChars(env, j_order, order);
}
//convert tensor in MKL layout back to Numpy NCHW layout //if layout diff, do conversion, else, copy memory directly void ConvertBack(unsigned long long tensor_, int N, int C, int H, int W) { long long * tensor = (long long *)tensor_; if (tensor[CPUPtr] == 0 ) { printf("error to converback tensor!\n"); return; } if (tensor[MKLLayout] == 0) return;//do not need convert dnnError_t err; size_t inSize[DIM4] = { W, H, C, N}; size_t inStride[DIM4] = { 1, W, W*H, W*H*C}; dnnLayout_t lt_NCHW = NULL, lt_CHWN = NULL; float* newPtr = NULL; CHECK_ERR( dnnLayoutCreate_F32(<_NCHW, DIM4, inSize, inStride), err ); if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], (dnnLayout_t)tensor[CPULayout])) { float* cpuPtr = (float *)tensor[CPUPtr]; float* mklPtr = (float *)tensor[MKLPtr]; if (!dnnLayoutCompare_F32((dnnLayout_t)tensor[MKLLayout], lt_NCHW)) { dnnPrimitive_t cv; CHECK_ERR( dnnConversionCreate_F32(&cv, (dnnLayout_t)tensor[MKLLayout],lt_NCHW), err ); newPtr = (float*)malloc(N*C*H*W*sizeof(float)); CHECK_ERR( dnnConversionExecute_F32(cv, mklPtr, newPtr), err ); mklPtr = newPtr; } mkl_somatcopy('r', 't', N, C*H*W, 1.0, mklPtr, C*H*W, cpuPtr, N); } else { long long grad_in_len = (long long)dnnLayoutGetMemorySize_F32((dnnLayout_t)tensor[MKLLayout]) ; float * destPtr = (float*)tensor[CPUPtr]; float * srcPtr = (float*)tensor[MKLPtr]; #pragma omp parallel for for (long long i = 0; i < grad_in_len/4; ++i) { destPtr[i] = srcPtr[i]; } } ERR_RETURN: if (newPtr!=NULL) { free(newPtr); } }
// Forward pass of SSD-style detection post-processing:
// softmax over per-box class scores, per-class top-k selection + NMS,
// then a global top-k per image. Results are written to res_detection
// (bs * image_topk rows of 6 floats: x1,y1,x2,y2,score,class) and the
// per-image result counts to res_batch_len.
// Returns 0 on success, -1 if a work buffer allocation fails.
long detection_fprop(
    float* conf,            //score for each class for each box, num_box * num_class * bs
    float* loc,             //location for each box, box * 4 * bs
    float* res_detection,   //final memory storing boxes, bs * top_k
    float* prior_boxes,     //num_boxes * 4
    long * res_batch_len,   //count of results for each image, bs entries
    const long num_boxes,   //num_boxes, each is a potential object
    const long num_class,   //number of classes
    const long bs,          //batch size
    const long nms_topk,    //top k boxes kept per class before NMS
    const long image_topk,  //top k boxes kept per image overall
    const float score_threshold, //minimum score to accept a box as an object
    const float nms_threshold)   //IoU threshold: above it two boxes are one object
{
    //sorted result of index
    long* index_batch = malloc(bs*num_boxes*num_class*sizeof(long));
    //scores to be sorted
    float* scores_batch = malloc(bs*num_boxes*num_class*sizeof(float));
    //temp detections per image, grows while iterating over classes
    float* temp_res_detection_batch = malloc(bs*num_class*nms_topk*6*sizeof(float));
    //internal memory storing sorted boxes for one class (x1,y1,x2,y2,score)
    float* internal_detection_batch = malloc(bs*nms_topk*5*sizeof(float));
    //internal memory storing decoded box locations
    float* proposal_batch = malloc(bs*num_boxes*4*sizeof(float));
    //transpose KLN to NKL
    float* conf_t = malloc(num_boxes * num_class * bs * sizeof(float));
    float* loc_t = malloc(num_boxes * 4 * bs * sizeof(float));
    //FIX: bail out on OOM instead of dereferencing NULL (free(NULL) is a no-op)
    if (!index_batch || !scores_batch || !temp_res_detection_batch ||
        !internal_detection_batch || !proposal_batch || !conf_t || !loc_t)
    {
        free(index_batch); free(scores_batch); free(temp_res_detection_batch);
        free(internal_detection_batch); free(proposal_batch);
        free(conf_t); free(loc_t);
        return -1;
    }
    mkl_somatcopy('r', 't', num_boxes*num_class, bs, 1.0, conf, bs, conf_t, num_boxes*num_class);
    mkl_somatcopy('r', 't', num_boxes*4, bs, 1.0, loc, bs, loc_t, num_boxes*4);
    //loop over images; each iteration touches only its own slices
    #pragma omp parallel for
    for(long b=0; b<bs; ++b)
    {
        float* scores = scores_batch + b * num_boxes*num_class;
        float* temp_res_detection = temp_res_detection_batch + b * num_class*nms_topk*6;
        long* index = index_batch + b * num_boxes*num_class;
        float* internal_detection = internal_detection_batch + b * nms_topk*5;
        float* proposal = proposal_batch + b * num_boxes*4;
        //class scores for this image, softmax over classes per box
        float* conf_batch = conf_t + b * num_boxes * num_class;
        softmax(conf_batch, num_boxes, num_class);
        //store scores class-major: scores[c*num_boxes + box]
        mkl_somatcopy('r', 't', num_boxes, num_class, 1.0, conf_batch, num_class, scores, num_boxes);
        //decode box regression deltas against the priors
        bbox_transform_inv(prior_boxes, loc_t + b * 4 * num_boxes, proposal, num_boxes);
        long res_len = 0; //count of feasible boxes for this image
        for(long c=1; c<num_class; ++c) //class 0 (background) is skipped
        {
            //per class: select the first nms_topk boxes above threshold into index
            long sort_nums_res = get_top_N_index(scores + c*num_boxes, nms_topk, num_boxes, score_threshold, index);
            if(sort_nums_res > 0)
            {
                //store location and score in internal_detection for overlap check
                for(long i=0; i<sort_nums_res; ++i)
                {
                    for(long j=0; j<4; ++j)
                        internal_detection[i*5+j] = proposal[index[i]*4+j];
                    //NOTE(review): indexes with i, not index[i]; this assumes
                    //get_top_N_index compacts the selected scores to the front
                    //of this class's segment - confirm against its implementation
                    internal_detection[i*5+4] = scores[c*num_boxes+i];
                }
                //remove overlapped boxes
                sort_nums_res = nms(internal_detection, index, nms_threshold, 1, sort_nums_res);
                //store result in temp memory and append the class id (width 6)
                for(long i=0; i<sort_nums_res; ++i)
                {
                    float* temp = temp_res_detection + (res_len+i)*6;
                    for(long j=0; j<5; ++j)
                    {
                        temp[j] = internal_detection[index[i]*5+j];
                    }
                    temp[5] = c; //class id
                }
                res_len += sort_nums_res;
            }
        }
        //select the first image_topk boxes over all classes for this image
        for(long i=0; i<res_len; ++i)
        {
            scores[i] = temp_res_detection[i*6+4];
            index[i] = i;
        }
        long sort_nums_res = res_len;
        if(sort_nums_res>image_topk) //keep only the top image_topk of res_len
        {
            sort_nums_res = get_top_N_index(scores, image_topk, res_len, 0.0, index);
        }
        //store the sorted result in the final output
        float* temp = res_detection + b * image_topk * 6;
        for(long i=0; i<sort_nums_res; ++i)
        {
            for(long j=0; j<6; ++j)
            {
                temp[i*6+j] = temp_res_detection[index[i]*6+j];
            }
        }
        res_batch_len[b] = sort_nums_res;
    }
    free(conf_t);
    free(loc_t);
    free(index_batch);
    free(scores_batch);
    free(temp_res_detection_batch);
    free(proposal_batch);
    free(internal_detection_batch);
    //FIX: original fell off the end of a non-void function (undefined value
    //if the caller reads the result); report success explicitly
    return 0;
}
void Conv_bwdFilter( unsigned long long input, unsigned long long gradOutput, unsigned long long gradWeight, unsigned long long gradBias, unsigned long long dnnprimitives, int N, int oC, int oH, int oW, int initOk, int has_delta) { dnnError_t err; long long * primitives = (long long * )dnnprimitives; if (initOk == 0) { Conv_bfilter_init((long long *)input,(long long *)gradOutput,(long long *)gradWeight, primitives, N, oC, oH, oW); } float * inPtr = GetPtr(input); float * filterPtr = GetPtr(gradWeight); float * outPtr = GetPtr(gradOutput); float * resConv[dnnResourceNumber]={0}; float * resBias[dnnResourceNumber]={0}; resConv[dnnResourceDiffFilter] = filterPtr; //do input conversion if necessary float* newInputPtr = (float*)primitives[BUFFER_TRANS_INPUT]; if (newInputPtr != NULL) { inPtr = newInputPtr; } resConv[dnnResourceSrc] = inPtr; dnnPrimitive_t cv_in_bfilter = (dnnPrimitive_t)primitives[CONVERT_BWDFILTER_INPUT]; if (cv_in_bfilter) { float* buf_in_bfilter = (float *)(primitives[BUFFER_BWDFILTER_INPUT]); CHECK_ERR( dnnConversionExecute_F32(cv_in_bfilter, inPtr, buf_in_bfilter), err ); resConv[dnnResourceSrc] = buf_in_bfilter; } //for gradout in cpu layout float* newGradOutPtr = (float*)primitives[BUFFER_TRANS_OUTPUT]; if (newGradOutPtr != NULL) { if (!has_delta) //for the first layer without delta { mkl_somatcopy('r', 't', oC*oH*oW, N, 1.0, outPtr, N, newGradOutPtr, oC*oH*oW); } outPtr = newGradOutPtr;//use transposed NCHW layout } resConv[dnnResourceDiffDst] = outPtr; //do gradOutput conversion if necessary dnnPrimitive_t cv_out_bfilter = (dnnPrimitive_t)primitives[CONVERT_BWDFILTER_OUTPUT]; if (cv_out_bfilter) { float* buf_out_bfilter = (float *)(primitives[BUFFER_BWDFILTER_OUTPUT]); CHECK_ERR( dnnConversionExecute_F32(cv_out_bfilter, outPtr, buf_out_bfilter), err ); resConv[dnnResourceDiffDst] = buf_out_bfilter; resBias[dnnResourceDiffDst] = buf_out_bfilter; } dnnPrimitive_t cv_filter_bfilter = (dnnPrimitive_t)primitives[CONVERT_BWDFILTER_FILTER]; float* 
buf_filter_bfilter = (float *)(primitives[BUFFER_BWDFILTER_FILTER]); if (cv_filter_bfilter) { resConv[dnnResourceDiffFilter] = buf_filter_bfilter; } CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[BWD_FILTER_INDEX], (void**)resConv), err); //bias if (gradBias != 0) { float * biasPtr = GetPtr(gradBias); dnnPrimitive_t cv_bias_bias = (dnnPrimitive_t)primitives[CV_BIAS_BIAS]; resBias[dnnResourceDiffBias] = biasPtr; if (cv_bias_bias) { resBias[dnnResourceDiffBias] = (float*)primitives[BUFFER_BIAS_BIAS]; } resBias[dnnResourceDiffDst] = outPtr; dnnPrimitive_t cv_out_bias = (dnnPrimitive_t)primitives[CV_BIAS_OUT]; if (cv_out_bias) { float* buf_out_bias = (float*)primitives[BUFFER_BIAS_OUT]; CHECK_ERR( dnnConversionExecute_F32(cv_out_bias, outPtr, buf_out_bias), err ); resBias[dnnResourceDiffDst] = outPtr; } CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[BDW_BIAS_INDEX], (void**)resBias), err); if (cv_bias_bias) { CHECK_ERR( dnnConversionExecute_F32(cv_bias_bias,resBias[dnnResourceDiffBias], biasPtr), err ); } } //do gradWeight conversion if necessary if (cv_filter_bfilter) { CHECK_ERR( dnnConversionExecute_F32(cv_filter_bfilter, buf_filter_bfilter, filterPtr), err ); } ERR_RETURN: return; }
// Forward MKL-DNN convolution.
//   input/output/weight/bias: tensor headers (bias may be 0 for no-bias conv)
//   dnnprimitives: per-layer cache of MKL primitives and scratch buffers
//   initOk:        0 on the first call -> build primitives via Conv_f_init
//   N,inC,inH,inW: batch size and input geometry
//   kH,kW,dH,dW,padH,padW: kernel size, stride, and padding
//   outC,outH,outW: output geometry
// Returns 0 on success, 1 on init/conversion/execute failure (CHECK_ERR
// jumps to ERR_RETURN). On success the output tensor header is stamped
// with the MKL-side buffer pointer and layout.
int Conv_forward(
    unsigned long long input, unsigned long long output,
    unsigned long long weight, unsigned long long bias,
    unsigned long long dnnprimitives, int initOk,
    int N, int inC, int inH, int inW,
    int kH, int kW, int dH, int dW, int padH, int padW,
    int outC, int outH, int outW)
{
    dnnError_t err;
    long long * primitives = (long long*)dnnprimitives;
    if(initOk == 0)
    {
        int hasBias = 1;
        if (bias == 0) hasBias = 0;
        //for the first time, initialize layouts and conversions
        int res = Conv_f_init((long long *)input, (long long *)output, (long long *)weight,
            primitives, N, inC, inH, inW, kH, kW, dH, dW, padH, padW, outC, outH, outW, hasBias);
        if(res)
        {
            return 1;
        }
    }
    //get memory as resources; output goes straight into the cached MKL buffer
    float* resConv[dnnResourceNumber]={0};
    float* outPtr = (float*)primitives[BUFFER_FORWARD_OUTPUT];
    float* filterPtr = GetPtr(weight);
    float* biasPtr = NULL;
    resConv[dnnResourceFilter] = filterPtr;
    resConv[dnnResourceDst] = outPtr;
    float* inPtr = GetPtr(input);
    if(bias != 0) resConv[dnnResourceBias] = GetPtr(bias);
    //transpose input if a scratch buffer was allocated at init time
    //(input arrives CHWN and must become NCHW before any MKL conversion)
    float* newPtr = (float*)primitives[BUFFER_TRANS_INPUT];
    if( newPtr != NULL)
    {
        mkl_somatcopy('r', 't', inC*inH*inW, N, 1.0, inPtr, N, newPtr, inC*inH*inW);
        inPtr = newPtr;
    }
    resConv[dnnResourceSrc] = inPtr;
    dnnPrimitive_t cv_in_f = (dnnPrimitive_t)primitives[CONVERT_FORWARD_INPUT];
    if(cv_in_f)
    {
        //convert (possibly transposed) input into the primitive's layout
        float* buf_in_f = (float *)(primitives[BUFFER_FORWARD_INPUT]);
        CHECK_ERR( dnnConversionExecute_F32(cv_in_f, inPtr, buf_in_f), err );
        resConv[dnnResourceSrc] = buf_in_f;
    }
    //do conversion for filter if necessary
    dnnPrimitive_t cv_filter_f = (dnnPrimitive_t)primitives[CONVERT_FORWARD_FILTER];
    if(cv_filter_f)
    {
        float* buf_filter_f = (float *)(primitives[BUFFER_FORWARD_FILTER]);
        CHECK_ERR( dnnConversionExecute_F32(cv_filter_f, filterPtr, buf_filter_f), err );
        resConv[dnnResourceFilter] = buf_filter_f;
    }
    //do conversion for bias if necessary
    dnnPrimitive_t cv_bias_f = (dnnPrimitive_t)primitives[CONVERT_FORWARD_BIAS];
    if (cv_bias_f)
    {
        biasPtr = GetPtr(bias);
        float* buf_bias_f = (float *)primitives[BUFFER_FORWARD_BIAS];
        CHECK_ERR( dnnConversionExecute_F32(cv_bias_f, biasPtr, buf_bias_f), err );
        resConv[dnnResourceBias] = buf_bias_f;
    }
    //real execute operation
    CHECK_ERR(dnnExecute_F32((dnnPrimitive_t)primitives[FORWARD_INDEX],(void**)resConv),err);
    //always fill in MKL information on the output tensor header
    ((long long*)output)[MKLPtr] = primitives[BUFFER_FORWARD_OUTPUT];
    ((long long*)output)[MKLLayout] = (long long)primitives[L_F_O];
    return 0;
ERR_RETURN:
    return 1;
}