/*
 * Run this layer's convolution through filter_AVX_impl.
 *
 * packed_input  : input buffer, handed unchanged to the implementation(s)
 * packed_output : output buffer written by filter_AVX_impl
 * size          : image dimensions (width x height)
 *
 * Returns true.  In COMPARE_RESULT builds it returns false when the
 * scratch buffer for the reference output cannot be allocated.
 *
 * When COMPARE_RESULT is defined, filter_CV is additionally run into a
 * scratch buffer, timing / Gflops figures for both runs are printed, and
 * the two outputs are compared element by element.  The process exits
 * with status 1 on the first element whose relative error exceeds 0.1
 * while its absolute error exceeds 1e-7.
 */
bool Model::filter_AVX(const float *packed_input,
		       float *packed_output,
		       cv::Size size)
{
#ifdef COMPARE_RESULT
	const int num_outputs = size.width * size.height * nOutputPlanes;
	float *packed_output_cv = (float*)malloc(sizeof(float) * size.width * size.height * nOutputPlanes);

	if (!packed_output_cv) {
		/* no reference buffer, so nothing can be compared */
		return false;
	}

	double t0 = getsec();
	filter_CV(packed_input, packed_output_cv, size);
	double t1 = getsec();

	/* 3x3 = 9 fma */
	double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes;
	filter_AVX_impl(packed_input, packed_output,
			nInputPlanes, nOutputPlanes, biases, weights, size, nJob);
	double t2 = getsec();

	printf("%d %d %f %f\n", nInputPlanes, nOutputPlanes, t1-t0, t2-t1);
	printf("ver2 : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1));
	printf("orig : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t1-t0));

	for (int i=0; i<num_outputs; i++) {
		float v0 = packed_output_cv[i];
		float v1 = packed_output[i];
		float d = fabs(v0 - v1);

		/* relative error against each of the two values; keep the worse one */
		float r0 = d/fabs(v0);
		float r1 = d/fabs(v1);

		/* parenthesised so a function-like max() macro cannot expand here */
		float r = (std::max)(r0, r1);

		/* the absolute threshold keeps near-zero values from tripping the relative test */
		if (r > 0.1f && d > 0.0000001f) {
			printf("d=%.20f %.20f %.20f @ %d\n", r, v0, v1, i);
			exit(1);
		}
	}

	free(packed_output_cv);
#else
	//double t1 = getsec();
	filter_AVX_impl(packed_input, packed_output,
			nInputPlanes, nOutputPlanes, biases, weights, size, nJob);
	//double t2 = getsec();
	//double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes;
	//printf("ver2 : %f [Gflops], %f[msec]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1), (t2-t1)*1000);
#endif

	return true;
}
bool Model::filter_AVX_OpenCL(ComputeEnv *env,
			      Buffer *packed_input_buf,
			      Buffer *packed_output_buf,
			      cv::Size size,
			      enum runtype rt)
{
	int vec_width;
	int weight_step;
	unsigned int eax=0, ebx=0, ecx=0, edx=0;
	bool have_fma = false, have_avx = false;
	int nJob = modelUtility::getInstance().getNumberOfJobs();

#ifdef __GNUC__
	__get_cpuid(1, &eax, &ebx, &ecx, &edx);
#else
	int cpuInfo[4];
	__cpuid(cpuInfo, 1);
	eax = cpuInfo[0];
	ebx = cpuInfo[1];
	ecx = cpuInfo[2];
	edx = cpuInfo[3];
#endif
	if ((ecx & 0x18000000) == 0x18000000) {
		have_avx = true;
	}

	if (ecx & (1<<12)) {
		have_fma = true;
	}

	bool gpu = (rt == RUN_OPENCL) || (rt == RUN_CUDA);

	if (gpu) {
		weight_step = GPU_VEC_WIDTH;
		vec_width = GPU_VEC_WIDTH;
	} else {
		weight_step = nOutputPlanes;
		vec_width = VEC_WIDTH;
	}

	float *weight_flat = (float*)_mm_malloc(sizeof(float)*nInputPlanes*weight_step*3*3, 64);
	float *fbiases_flat = (float*)_mm_malloc(sizeof(float) * biases.size(), 64);

	for (int i=0; i<(int)biases.size(); i++) {
		fbiases_flat[i] = biases[i];
	}

	if (nOutputPlanes == 1) {
		if (gpu) {
			for (int ii=0; ii<nInputPlanes; ii++) {
				cv::Mat &wm = weights[ii];
				const float *src0 = (float*)wm.ptr(0);
				const float *src1 = (float*)wm.ptr(1);
				const float *src2 = (float*)wm.ptr(2);

				float *dst = weight_flat + ii * 9;
				dst[0] = src0[0];
				dst[1] = src0[1];
				dst[2] = src0[2];

				dst[3] = src1[0];
				dst[4] = src1[1];
				dst[5] = src1[2];

				dst[6] = src2[0];
				dst[7] = src2[1];
				dst[8] = src2[2];

			}
		} else {
			for (int ii=0; ii<nInputPlanes; ii++) {
				cv::Mat &wm = weights[ii];
				const float *src0 = (float*)wm.ptr(0);
				const float *src1 = (float*)wm.ptr(1);
				const float *src2 = (float*)wm.ptr(2);

				int ii_0 = ii % vec_width;
				int ii_1 = (ii / vec_width) * vec_width;

				float *dst = weight_flat + ii_1 * 9  + ii_0;
				dst[0 * vec_width] = src0[0];
				dst[1 * vec_width] = src0[1];
				dst[2 * vec_width] = src0[2];

				dst[3 * vec_width] = src1[0];
				dst[4 * vec_width] = src1[1];
				dst[5 * vec_width] = src1[2];

				dst[6 * vec_width] = src2[0];
				dst[7 * vec_width] = src2[1];
				dst[8 * vec_width] = src2[2];
			}
		}
	} else if (gpu && nInputPlanes == 1) {
		for (int oi=0; oi<nOutputPlanes; oi++) {
			cv::Mat &wm = weights[oi];
			const float *src0 = (float*)wm.ptr(0);
			const float *src1 = (float*)wm.ptr(1);
			const float *src2 = (float*)wm.ptr(2);

			float *dst = weight_flat + oi * 9;
			dst[0] = src0[0];
			dst[1] = src0[1];
			dst[2] = src0[2];

			dst[3] = src1[0];
			dst[4] = src1[1];
			dst[5] = src1[2];

			dst[6] = src2[0];
			dst[7] = src2[1];
			dst[8] = src2[2];
		}
	} else if (nOutputPlanes == 3) {
		/* |       o0        |       o1        | o2 ... |
		 * |i0 i1 i2 ... i127|i0 i1 i2 ... i127| ...    |*/

		for (int oi=0; oi<nOutputPlanes; oi++) {
			for (int ii=0; ii<nInputPlanes; ii++) {
				int mi = oi*nInputPlanes+ii;
				cv::Mat &wm = weights[mi];
				const float *src0 = (float*)wm.ptr(0);
				const float *src1 = (float*)wm.ptr(1);
				const float *src2 = (float*)wm.ptr(2);

				float *dst = weight_flat + (oi * nInputPlanes * 9) + ii;
				dst[0*nInputPlanes] = src0[0];
				dst[1*nInputPlanes] = src0[1];
				dst[2*nInputPlanes] = src0[2];

				dst[3*nInputPlanes] = src1[0];
				dst[4*nInputPlanes] = src1[1];
				dst[5*nInputPlanes] = src1[2];

				dst[6*nInputPlanes] = src2[0];
				dst[7*nInputPlanes] = src2[1];
				dst[8*nInputPlanes] = src2[2];
			}
		}
	} else if (gpu && (nInputPlanes == 3) && (nOutputPlanes == 32)) {
		/* | i0             | i1        | i2 .. iN-1|
		 * |o0 o1 o2 o3..o31|o0 .... o32| ....      |
		 * |<-            ->|
		 * |    32          |
		 * |   x  9         |
		 */

		for (int oi=0; oi<nOutputPlanes; oi++) {
			for (int ii=0; ii<nInputPlanes; ii++) {
				int mi = oi*nInputPlanes+ii;
				cv::Mat &wm = weights[mi];
				const float *src0 = (float*)wm.ptr(0);
				const float *src1 = (float*)wm.ptr(1);
				const float *src2 = (float*)wm.ptr(2);

				float *dst = weight_flat + (ii * nOutputPlanes * 9) + oi;
				dst[0*nOutputPlanes] = src0[0];
				dst[1*nOutputPlanes] = src0[1];
				dst[2*nOutputPlanes] = src0[2];

				dst[3*nOutputPlanes] = src1[0];
				dst[4*nOutputPlanes] = src1[1];
				dst[5*nOutputPlanes] = src1[2];

				dst[6*nOutputPlanes] = src2[0];
				dst[7*nOutputPlanes] = src2[1];
				dst[8*nOutputPlanes] = src2[2];
			}
		}
	} else {
		/* | i0        | i1        | i2 .. iN-1|   i0      | i1        | ..
		 * |o0 o1 o2 o3|o0 o1 o2 o3| ....      |o4 o5 o6 o7|o4 o5 o6 o7| ..
		 * |<-       ->|
		 * | VEC_WIDTH |
		 * |   x  9    |
		 */

		for (int oi=0; oi<nOutputPlanes; oi++) {
			for (int ii=0; ii<nInputPlanes; ii++) {
				int mi = oi*nInputPlanes+ii;
				cv::Mat &wm = weights[mi];
				const float *src0 = (float*)wm.ptr(0);
				const float *src1 = (float*)wm.ptr(1);
				const float *src2 = (float*)wm.ptr(2);

				int oi_0 = oi % vec_width;
				int oi_1 = (oi / vec_width) * vec_width;

				float *dst = weight_flat + ((ii*weight_step + oi_1) * 9) + oi_0;
				dst[0*vec_width] = src0[0];
				dst[1*vec_width] = src0[1];
				dst[2*vec_width] = src0[2];

				dst[3*vec_width] = src1[0];
				dst[4*vec_width] = src1[1];
				dst[5*vec_width] = src1[2];

				dst[6*vec_width] = src2[0];
				dst[7*vec_width] = src2[1];
				dst[8*vec_width] = src2[2];
			}
		}
	}

	bool compare_result = false;

#ifdef COMPARE_RESULT
	if (nOutputPlanes == 3) {
		compare_result = true;
	}
#endif

	size_t in_size = size.width * size.height * sizeof(float) * nInputPlanes;
	size_t out_size = size.width * size.height * sizeof(float) * nOutputPlanes;

	if (compare_result) {
		Buffer *packed_output_cv_buf = new Buffer(env, sizeof(float) * size.width * size.height * nOutputPlanes);

		double t0 = getsec();
		filter_CV(env, packed_input_buf, packed_output_cv_buf, size);
		//filter_FMA_impl(packed_input, packed_output_cv,
		//		nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat, size, nJob);
		double t1 = getsec();

		/* 3x3 = 9 fma */
		double ops = size.width * size.height * 9.0 * 2.0 * nOutputPlanes * nInputPlanes;
		std::vector<cv::Mat> output2;

		if (rt == RUN_OPENCL) {
			filter_OpenCL_impl(env, packed_input_buf, packed_output_buf,
					   nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
					   size.width, size.height, nJob);
		} else if (rt == RUN_CUDA) {
			filter_CUDA_impl(env, packed_input_buf, packed_output_buf,
					 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
					 size.width, size.height, nJob);
		} else {
			const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size);
			float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env);

			if (have_fma) {
				filter_FMA_impl(env, packed_input, packed_output,
						nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
						size.width, size.height, nJob);
			} else {
				filter_AVX_impl(env, packed_input, packed_output,
						nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
						size.width, size.height, nJob);
			}
		}

		double t2 = getsec();

		printf("(w=%d,h=%d) (ip=%d,op=%d) %f %f %f[gflops]\n", size.width, size.height, nInputPlanes, nOutputPlanes, t1-t0, t2-t1, ops/(1000*1000*1000));
		printf("ver2 : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t2-t1));
		printf("orig : %f [Gflops]\n", (ops/(1000.0*1000.0*1000.0)) / (t1-t0));
		int error_count = 0;

		float *packed_output_cv = (float*)packed_output_cv_buf->get_read_ptr_host(env, out_size);
		float *packed_output = (float*)packed_output_buf->get_read_ptr_host(env, out_size);

		for (int i=0; i<size.width * size.height * nOutputPlanes; i++) {
			float v0 = packed_output_cv[i];
			float v1 = packed_output[i];
			float d = fabs(v0 - v1);

			float r0 = d/fabs(v0);
			float r1 = d/fabs(v1);

			float r = (std::max)(r0, r1);

			if (r > 0.1f && d > 0.000001f) {
				int plane = i % nOutputPlanes;
				int pixpos = i / nOutputPlanes;
				int xpos = pixpos % size.width;
				int ypos = pixpos / size.width;

				printf("d=%.20f %.20f %.20f @ (%d,%d,%d,%d) \n",r, v0, v1, xpos, ypos, plane, i);
				error_count++;

				if (error_count >= 256) {
					exit(1);
				}
			}
		}

		if (error_count != 0) {
			exit(1);
		}

		delete packed_output_cv_buf;
	} else {
		if (rt == RUN_OPENCL) {
			filter_OpenCL_impl(env,
					   packed_input_buf, packed_output_buf,
					   nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
					   size.width, size.height, nJob);
		} else if (rt == RUN_CUDA) {
			filter_CUDA_impl(env,
					 packed_input_buf, packed_output_buf,
					 nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
					 size.width, size.height, nJob);
		} else {
			if (!have_avx) {
				filter_CV(env, packed_input_buf, packed_output_buf, size);
			} else {
				const float *packed_input = (float*)packed_input_buf->get_read_ptr_host(env, in_size);
				float *packed_output = (float*)packed_output_buf->get_write_ptr_host(env);

				if (have_fma) {
					filter_FMA_impl(env, packed_input, packed_output,
							nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
							size.width, size.height, nJob);
				} else if (have_avx) {
					filter_AVX_impl(env, packed_input, packed_output,
							nInputPlanes, nOutputPlanes, fbiases_flat, weight_flat,
							size.width, size.height, nJob);
				}
			}
		}
	}

	_mm_free(fbiases_flat);
	_mm_free(weight_flat);

	return true;

}