bool Model::filter_AVX(const float *packed_input, float *packed_output, cv::Size size) { #ifdef COMPARE_RESULT float *packed_output_cv = (float*)malloc(sizeof(float) * size.width * size.height * nOutputPlanes); double t0 = getsec(); filter_CV(packed_input, packed_output_cv, size); double t1 = getsec(); /* 3x3 = 9 fma */ double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes; std::vector<cv::Mat> output2; filter_AVX_impl(packed_input, packed_output, nInputPlanes, nOutputPlanes, biases, weights, size, nJob); double t2 = getsec(); printf("%d %d %f %f\n", nInputPlanes, nOutputPlanes, t1-t0, t2-t1); printf("ver2 : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1)); printf("orig : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t1-t0)); for (int i=0; i<size.width * size.height * nOutputPlanes; i++) { float v0 = packed_output_cv[i]; float v1 = packed_output[i]; float d = fabs(v0 - v1); float r0 = d/fabs(v0); float r1 = d/fabs(v1); float r = std::max(r0, r1); if (r > 0.1f && d > 0.0000001f) { printf("d=%.20f %.20f %.20f @ \n",r, v0, v1, i); exit(1); } } #else //double t1 = getsec(); filter_AVX_impl(packed_input, packed_output, nInputPlanes, nOutputPlanes, biases, weights, size, nJob); //double t2 = getsec(); //double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes; //printf("ver2 : %f [Gflops], %f[msec]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1), (t2-t1)*1000); #endif return true; }
bool Model::filter_AVX_OpenCL(ComputeEnv *env, Buffer *packed_input_buf, Buffer *packed_output_buf, cv::Size size, enum runtype rt) { int vec_width; int weight_step; unsigned int eax=0, ebx=0, ecx=0, edx=0; bool have_fma = false, have_avx = false; int nJob = modelUtility::getInstance().getNumberOfJobs(); #ifdef __GNUC__ __get_cpuid(1, &eax, &ebx, &ecx, &edx); #else int cpuInfo[4]; __cpuid(cpuInfo, 1); eax = cpuInfo[0]; ebx = cpuInfo[1]; ecx = cpuInfo[2]; edx = cpuInfo[3]; #endif if ((ecx & 0x18000000) == 0x18000000) { have_avx = true; } if (ecx & (1<<12)) { have_fma = true; } bool gpu = (rt == RUN_OPENCL) || (rt == RUN_CUDA); if (gpu) { weight_step = GPU_VEC_WIDTH; vec_width = GPU_VEC_WIDTH; } else { weight_step = nOutputPlanes; vec_width = VEC_WIDTH; } float *weight_flat = (float*)_mm_malloc(sizeof(float)*nInputPlanes*weight_step*3*3, 64); float *fbiases_flat = (float*)_mm_malloc(sizeof(float) * biases.size(), 64); for (int i=0; i<(int)biases.size(); i++) { fbiases_flat[i] = biases[i]; } if (nOutputPlanes == 1) { if (gpu) { for (int ii=0; ii<nInputPlanes; ii++) { cv::Mat &wm = weights[ii]; const float *src0 = (float*)wm.ptr(0); const float *src1 = (float*)wm.ptr(1); const float *src2 = (float*)wm.ptr(2); float *dst = weight_flat + ii * 9; dst[0] = src0[0]; dst[1] = src0[1]; dst[2] = src0[2]; dst[3] = src1[0]; dst[4] = src1[1]; dst[5] = src1[2]; dst[6] = src2[0]; dst[7] = src2[1]; dst[8] = src2[2]; } } else { for (int ii=0; ii<nInputPlanes; ii++) { cv::Mat &wm = weights[ii]; const float *src0 = (float*)wm.ptr(0); const float *src1 = (float*)wm.ptr(1); const float *src2 = (float*)wm.ptr(2); int ii_0 = ii % vec_width; int ii_1 = (ii / vec_width) * vec_width; float *dst = weight_flat + ii_1 * 9 + ii_0; dst[0 * vec_width] = src0[0]; dst[1 * vec_width] = src0[1]; dst[2 * vec_width] = src0[2]; dst[3 * vec_width] = src1[0]; dst[4 * vec_width] = src1[1]; dst[5 * vec_width] = src1[2]; dst[6 * vec_width] = src2[0]; dst[7 * vec_width] = src2[1]; dst[8 * vec_width] = src2[2]; } } } else if (gpu && nInputPlanes == 1) { for (int oi=0; oi<nOutputPlanes; oi++) { cv::Mat &wm = weights[oi]; const float *src0 = (float*)wm.ptr(0); const float *src1 = (float*)wm.ptr(1); const float *src2 = (float*)wm.ptr(2); float *dst = weight_flat + oi * 9; dst[0] = src0[0]; dst[1] = src0[1]; dst[2] = src0[2]; dst[3] = src1[0]; dst[4] = src1[1]; dst[5] = src1[2]; dst[6] = src2[0]; dst[7] = src2[1]; dst[8] = src2[2]; } } else if (nOutputPlanes == 3) { /* | o0 | o1 | o2 ... | * |i0 i1 i2 ... i127|i0 i1 i2 ... i127| ... |*/ for (int oi=0; oi<nOutputPlanes; oi++) { for (int ii=0; ii<nInputPlanes; ii++) { int mi = oi*nInputPlanes+ii; cv::Mat &wm = weights[mi]; const float *src0 = (float*)wm.ptr(0); const float *src1 = (float*)wm.ptr(1); const float *src2 = (float*)wm.ptr(2); float *dst = weight_flat + (oi * nInputPlanes * 9) + ii; dst[0*nInputPlanes] = src0[0]; dst[1*nInputPlanes] = src0[1]; dst[2*nInputPlanes] = src0[2]; dst[3*nInputPlanes] = src1[0]; dst[4*nInputPlanes] = src1[1]; dst[5*nInputPlanes] = src1[2]; dst[6*nInputPlanes] = src2[0]; dst[7*nInputPlanes] = src2[1]; dst[8*nInputPlanes] = src2[2]; } } } else if (gpu && (nInputPlanes == 3) && (nOutputPlanes == 32)) { /* | i0 | i1 | i2 .. iN-1| * |o0 o1 o2 o3..o31|o0 .... o32| .... | * |<- ->| * | 32 | * | x 9 | */ for (int oi=0; oi<nOutputPlanes; oi++) { for (int ii=0; ii<nInputPlanes; ii++) { int mi = oi*nInputPlanes+ii; cv::Mat &wm = weights[mi]; const float *src0 = (float*)wm.ptr(0); const float *src1 = (float*)wm.ptr(1); const float *src2 = (float*)wm.ptr(2); float *dst = weight_flat + (ii * nOutputPlanes * 9) + oi; dst[0*nOutputPlanes] = src0[0]; dst[1*nOutputPlanes] = src0[1]; dst[2*nOutputPlanes] = src0[2]; dst[3*nOutputPlanes] = src1[0]; dst[4*nOutputPlanes] = src1[1]; dst[5*nOutputPlanes] = src1[2]; dst[6*nOutputPlanes] = src2[0]; dst[7*nOutputPlanes] = src2[1]; dst[8*nOutputPlanes] = src2[2]; } } } else { /* | i0 | i1 | i2 .. iN-1| i0 | i1 | .. * |o0 o1 o2 o3|o0 o1 o2 o3| .... |o4 o5 o6 o7|o4 o5 o6 o7| .. * |<- ->| * | VEC_WIDTH | * | x 9 | */ for (int oi=0; oi<nOutputPlanes; oi++) { for (int ii=0; ii<nInputPlanes; ii++) { int mi = oi*nInputPlanes+ii; cv::Mat &wm = weights[mi]; const float *src0 = (float*)wm.ptr(0); const float *src1 = (float*)wm.ptr(1); const float *src2 = (float*)wm.ptr(2); int oi_0 = oi % vec_width; int oi_1 = (oi / vec_width) * vec_width; float *dst = weight_flat + ((ii*weight_step + oi_1) * 9) + oi_0; dst[0*vec_width] = src0[0]; dst[1*vec_width] = src0[1]; dst[2*vec_width] = src0[2]; dst[3*vec_width] = src1[0]; dst[4*vec_width] = src1[1]; dst[5*vec_width] = src1[2]; dst[6*vec_width] = src2[0]; dst[7*vec_width] = src2[1]; dst[8*vec_width] = src2[2]; } } } bool compare_result = false; #ifdef COMPARE_RESULT if (nOutputPlanes == 3) { compare_result = true; } #endif size_t in_size = size.width * size.height * sizeof(float) * nInputPlanes; size_t out_size = size.width * size.height * sizeof(float) * nOutputPlanes; if (compare_result) { Buffer *packed_output_cv_buf = new Buffer(env, sizeof(float) * size.width * size.height * nOutputPlanes); double t0 = getsec(); filter_CV(env, packed_input_buf, packed_output_cv_buf, size); //filter_FMA_impl(packed_input, packed_output_cv, // nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size, nJob); double t1 = getsec(); /* 3x3 = 9 fma */ double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes; std::vector<cv::Mat> output2; if (rt == RUN_OPENCL) { filter_OpenCL_impl(env, packed_input_buf, packed_output_buf, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } else if (rt == RUN_CUDA) { filter_CUDA_impl(env, packed_input_buf, packed_output_buf, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } else { const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size); float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env); if (have_fma) { filter_FMA_impl(env, packed_input, packed_output, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } else { filter_AVX_impl(env, packed_input, packed_output, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } } double t2 = getsec(); printf("(w=%d,h=%d) (ip=%d,op=%d) %f %f %f[gflops]\n", size.width, size.height, nInputPlanes, nOutputPlanes, t1-t0, t2-t1, ops/(1000*1000*1000)); printf("ver2 : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1)); printf("orig : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t1-t0)); int error_count = 0; float *packed_output_cv = (float*)packed_output_cv_buf->get_read_ptr_host(env, out_size); float *packed_output = (float*)packed_output_buf->get_read_ptr_host(env, out_size); for (int i=0; i<size.width * size.height * nOutputPlanes; i++) { float v0 = packed_output_cv[i]; float v1 = packed_output[i]; float d = fabs(v0 - v1); float r0 = d/fabs(v0); float r1 = d/fabs(v1); float r = (std::max)(r0, r1); if (r > 0.1f && d > 0.000001f) { int plane = i % nOutputPlanes; int pixpos = i / nOutputPlanes; int xpos = pixpos % size.width; int ypos = pixpos / size.width; printf("d=%.20f %.20f %.20f @ (%d,%d,%d,%d) \n",r, v0, v1, xpos, ypos, plane, i); error_count++; if (error_count >= 256) { exit(1); } } } if (error_count != 0) { exit(1); } delete packed_output_cv_buf; } else { if (rt == RUN_OPENCL) { filter_OpenCL_impl(env, packed_input_buf, packed_output_buf, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } else if (rt == RUN_CUDA) { filter_CUDA_impl(env, packed_input_buf, packed_output_buf, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } else { if (!have_avx) { filter_CV(env, packed_input_buf, packed_output_buf, size); } else { const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size); float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env); if (have_fma) { filter_FMA_impl(env, packed_input, packed_output, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } else if (have_avx) { filter_AVX_impl(env, packed_input, packed_output, nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size.width, size.height, nJob); } } } } _mm_free(fbiases_flat); _mm_free(weight_flat); return true; }