Beispiel #1
0
/* f32x4 mm mul */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
	int i, k, j;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_f32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_f32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_f32(A + j_T);

				neon_b = vld1q_f32(B + k_Row + j);
				neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
				neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
				neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
				neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

				neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

				vst1q_lane_f32(C + k_Row + i, neon_c, 0);
				vst1q_lane_f32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_f32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_f32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
Beispiel #2
0
void test_vgetQ_lanef32 (void)
{
  float32_t out_float32_t;
  float32x4_t arg0_float32x4_t;

  out_float32_t = vgetq_lane_f32 (arg0_float32x4_t, 1);
}
Beispiel #3
0
void
test_square_root_v4sf ()
{
  const float32_t pool[] = {4.0f, 9.0f, 16.0f, 25.0f};
  float32x4_t val;
  float32x4_t res;

  val = vld1q_f32 (pool);
  res = vsqrtq_f32 (val);

  if (vgetq_lane_f32 (res, 0) != 2.0f)
    abort ();
  if (vgetq_lane_f32 (res, 1) != 3.0f)
    abort ();
  if (vgetq_lane_f32 (res, 2) != 4.0f)
    abort ();
  if (vgetq_lane_f32 (res, 3) != 5.0f)
    abort ();
}
Beispiel #4
0
/* f32x4 mv mul */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

		}

		vst1q_f32(C + i, neon_c);
	}
}
Beispiel #5
0
// CHECK-LABEL: test_vgetq_lane_f32:
float32_t test_vgetq_lane_f32(float32x4_t v) {
  return vgetq_lane_f32(v, 0);
// CHECK-NEXT: ret
}
// __INLINE
void arm_cmplx_mult_cmplx_f32_dot(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t numSamples)
{
  float32_t a, b, c, d;                          /* Temporary variables to store real and imaginary values */
  float32x4_t A1, A2;                            /* Temporary variables to store real and imaginary values of source buffer A */
  float32x4_t B1, B2;                            /* Temporary variables to store real and imaginary values of source buffer B */
  float32x4_t C1, C2, C3, C4;                    /* Temporary variables to store multiplication output */
  float32x4x2_t out1, out2, out3, out4;          /* Temporary variables to stroe output result */
  float32x4x2_t acc1, acc2, acc3, acc4;            /* Accumulators */
  float 	 sum_real, sum_img;            		/*  */
  uint32_t blkCnt;                               /* loop counters */

  /* Clear accumulators   VDUP.32 q0,r0
	Vector Duplicate duplicates a scalar into every element of the destination vector.
    */
  acc1.val[0] = vdupq_n_f32(0.0f);
  acc1.val[1] = vdupq_n_f32(0.0f);
  acc2.val[0] = vdupq_n_f32(0.0f);
  acc2.val[1] = vdupq_n_f32(0.0f);
  acc3.val[0] = vdupq_n_f32(0.0f);
  acc3.val[1] = vdupq_n_f32(0.0f);
  acc4.val[0] = vdupq_n_f32(0.0f);
  acc4.val[1] = vdupq_n_f32(0.0f);

  /* Loop over blockSize number of values */
  blkCnt = numSamples >> 4u;

  while(blkCnt > 0u)
  {
  	/*  A1, A2, B1, B2 each has two complex data. */
	
	/******************************************************/
	/* Step 1: Load data A1, A2, B1, B2 for group a:*/
    /* read 2 complex values at a time from source A buffer 
	float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);
		 VLD1.32 {d0, d1}, [r0]
	*/
    A1 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;
    /* read 2 complex values at a time from source A buffer */
    A2 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;

    /* read 2 complex values at a time from source B buffer */
    B1 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;
    /* read 2 complex values at a time from source B buffer */
    B2 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;

	/******************************************************/
	/* Step 2: Unzip data Out1, Out2 for group a:*/
    /* unzip real and imag values
	A1: reala0, imga0, reala1, imga1
	A2: realb0, imgb0, realb1, imgb1
	out1.val0: reala0, reala1, realb0, realb1;
	out1.val1: imga0, imga1, imgb0, imgb1

	vuzpq_f32:
    	float32x4x2_t vuzpq_f32 (float32x4_t, float32x4_t) 
	Form of expected instruction(s): vuzp.32 q0, q1
	Vector Unzip de-interleaves the elements of two vectors. 
	*/
    out1 = vuzpq_f32(A1, A2);
    out2 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 1: Load data A1, A2, B1, B2 for group b:*/
    /* read 2 complex values at a time from source A buffer */
    A1 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;
    /* read 2 complex values at a time from source A buffer */
    A2 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;

    /* read 2 complex values at a time from source B buffer */
    B1 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;
    /* read 2 complex values at a time from source B buffer */
    B2 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;

	/******************************************************/
	/* Step 3: Compute data C1,C2,C3,C4 for group a:*/
    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
    /* vmulq_f32: VMUL.F32 q0,q0,q0
		val[0]: real
		val[1]: img
		C1 = a.real*b.real;  		C2 = a.img*b.img
		C3 = a.img*b.real;  		C4 = a.real*b.img
	*/    
	/* multiply 4 samples at a time from A1 real input with B1 real input 	*/
    C1 = vmulq_f32(out1.val[0], out2.val[0]);  
    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
    C2 = vmulq_f32(out1.val[1], out2.val[1]);
    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
    C3 = vmulq_f32(out1.val[1], out2.val[0]);
    /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
    C4 = vmulq_f32(out1.val[0], out2.val[1]);
	/*  real: c1-c2; img: c3+c4 */

	/******************************************************/
	/* Step 2: Unzip data Out2, Out3 for group b:*/
    out2 = vuzpq_f32(A1, A2);
    out3 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 1: Load data A1, A2 for group c:*/
    /* read 2 complex values at a time from source A buffer */
    A1 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;
    /* read 2 complex values at a time from source A buffer */
    A2 = vld1q_f32(pSrcA);
    /* increment source A buffer by 4 */
    pSrcA += 4u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group a:*/
	/*  (a+bi)*(c+di) = (ac-bd)+(ad+bc)i*/
	/*  real: c1-c2; img: c3+c4 */
    /* subtract 4 samples at time from real result to imaginary result, got four real part */
	/*  
		C1 = a.real*b.real; 		C2 = a.img*b.img
		C3 = a.img*b.real;		C4 = a.real*b.img

		vaddq_f32: 
		VADD.F32 q0,q0,q0
	*/
    out1.val[0] = vsubq_f32(C1, C2);
	acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]);  /* add by Hank */
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out1.val[1] = vaddq_f32(C3, C4);
	acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]); /* add by Hank */
	/* out1 is four complex product. */

	/******************************************************/
	/* Step 1: Load data B1, B2 for group c:*/
    /* read 2 complex values at a time from source B buffer */
    B1 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;
    /* read 2 complex values at a time from source B buffer */
    B2 = vld1q_f32(pSrcB);
    /* increment source B buffer by 4 */
    pSrcB += 4u;

	/******************************************************/
	/* Step 3: Compute data C1,C2   for group b:*/
    /* multiply 4 samples at a time from A1 real input with B1 real input */
    C1 = vmulq_f32(out2.val[0], out3.val[0]);
    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
    C2 = vmulq_f32(out2.val[1], out3.val[1]);

	/******************************************************/
	/* Step 5: Store data for group a:*/
    /* Store 4 complex samples to destination buffer
             VST2.32 {d0, d2}, [r0]   */
    //vst2q_f32(pDst, out1);
    /* increment destination buffer by 8 */
    //pDst += 8u;

	/******************************************************/
	/* Step 3: Compute data  C3,C4 for group b:*/
    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
    C3 = vmulq_f32(out2.val[1], out3.val[0]);
    
                           /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
	C4 = vmulq_f32(out2.val[0], out3.val[1]);

	/******************************************************/
	/* Step 2: Unzip data Out1, Out2 for group C:*/
    out3 = vuzpq_f32(A1, A2);
    out4 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 1: Load data A1, A2, B1, B2 for group d:*/
    /* read 4 complex values from source A buffer */
    A1 = vld1q_f32(pSrcA);
    pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);
    pSrcA += 4u;

    /* read 4 complex values from source B buffer */
    B1 = vld1q_f32(pSrcB);
    pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);
    pSrcB += 4u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group b:*/
    /* subtract 4 samples at time from real result to imaginary result */
    out2.val[0] = vsubq_f32(C1, C2);
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out2.val[1] = vaddq_f32(C3, C4);
	acc2.val[0] = vaddq_f32(out2.val[0], acc2.val[0]);  /* add by Hank */
	acc2.val[1] = vaddq_f32(out2.val[1], acc2.val[1]); /* add by Hank */

	/******************************************************/
	/* Step 3: Compute data C1,C2,C3,C4 for group c:*/
    /* multiply 4 samples at a time from A3 real input with B3 real input */
    C1 = vmulq_f32(out3.val[0], out4.val[0]);
    /* multiply 4 samples at a time from A3 imaginary input with B3 imaginary input */
    C2 = vmulq_f32(out3.val[1], out4.val[1]);
    /* multiply 4 samples at a time from A3 imaginary input with B3 real input */
    C3 = vmulq_f32(out3.val[1], out4.val[0]);
    /* multiply 4 samples at a time from A3 real input with B3 imaginary input */
    C4 = vmulq_f32(out3.val[0], out4.val[1]);

	/******************************************************/
	/* Step 2: Unzip data Out1, Out2 for group D:*/
    out1 = vuzpq_f32(A1, A2);
    out4 = vuzpq_f32(B1, B2);

	/******************************************************/
	/* Step 5: Store data for group b:*/
    /* Store 4 complex samples to destination buffer */
    //vst2q_f32(pDst, out2);
    /* increment destination buffer by 8 */
    //pDst += 8u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group c:*/
    /* subtract 4 samples at time from real result to imaginary result */
    out3.val[0] = vsubq_f32(C1, C2);
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out3.val[1] = vaddq_f32(C3, C4);
	acc3.val[0] = vaddq_f32(out3.val[0], acc3.val[0]);  /* add by Hank */
	acc3.val[1] = vaddq_f32(out3.val[1], acc3.val[1]); /* add by Hank */

	/******************************************************/
	/* Step 3: Compute data C1,C2,C3,C4 for group d:*/
    /* multiply 4 samples at a time from A1 real input with B1 real input */
    C1 = vmulq_f32(out1.val[0], out4.val[0]);
    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
    C2 = vmulq_f32(out1.val[1], out4.val[1]);

    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
    C3 = vmulq_f32(out1.val[1], out4.val[0]);
    /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
    C4 = vmulq_f32(out1.val[0], out4.val[1]);

	/******************************************************/
	/* Step 5: Store data for group c:*/
    /* Store 4 complex samples to destination buffer */
    //vst2q_f32(pDst, out3);

	/******************************************************/
	/* Step 4: Output or accumlate data for group d:*/
    /* subtract 4 samples at time from real result to imaginary result */
    out4.val[0] = vsubq_f32(C1, C2);

    /* increment destination buffer by 8 */
    //pDst += 8u;

	/******************************************************/
	/* Step 4: Output or accumlate data for group d:*/
    /* add real*imaginary result with imaginary*real result 4 at a time */
    out4.val[1] = vaddq_f32(C3, C4);
	acc4.val[0] = vaddq_f32(out4.val[0], acc4.val[0]);  /* add by Hank */
	acc4.val[1] = vaddq_f32(out4.val[1], acc4.val[1]); /* add by Hank */

    /* zip real and imag values */
    //out4 = vzipq_f32(out4.val[0], out4.val[1]);

	/******************************************************/
	/* Step 5: Store data for group d:*/
    /* Store 4 complex samples to destination buffer */
    //vst1q_f32(pDst, out4.val[0]);
    //pDst += 4u;
    //vst1q_f32(pDst, out4.val[1]);
    //pDst += 4u;

    /* Decrement the numSamples loop counter */
    blkCnt--;
  }

  blkCnt = numSamples & 15u;
  blkCnt = blkCnt >> 2u;

  /* If the blockSize is not a multiple of 16, compute remaining output samples.     
   ** Compute multiple of 4 samples at a time in second loop.  
   ** and remaining 1 to 3 samples in third loop. */
  while(blkCnt > 0u)
  {
	/* Step 1: Load data A1, A2, B1, B2 */
	    /* read 4 complex values at a time from source A buffer */
	    A1 = vld1q_f32(pSrcA);
	    /* increment source A buffer by 8 */
	    pSrcA += 4u;
	    A2 = vld1q_f32(pSrcA);
	    pSrcA += 4u;
	    /* read 4 complex values at a time from source B buffer */
	    B1 = vld1q_f32(pSrcB);
	    /* increment source B buffer by 8 */
	    pSrcB += 4u;
	    B2 = vld1q_f32(pSrcB);
	    pSrcB += 4u;

	/* Step 2: Unzip data Out1, Out2 */
		/* Unzip data */
	    out1 = vuzpq_f32(A1, A2);
	    out2 = vuzpq_f32(B1, B2);

	/* Step 3: Compute data C1,C2,C3,C4 */
	    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
	    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
	    /* multiply 4 samples at a time from A1 real input with B1 real input */
	    C1 = vmulq_f32(out1.val[0], out2.val[0]);
	    /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */
	    C2 = vmulq_f32(out1.val[1], out2.val[1]);
	    /* multiply 4 samples at a time from A1 imaginary input with B1 real input */
	    C3 = vmulq_f32(out1.val[1], out2.val[0]);
	    /* multiply 4 samples at a time from A1 real input with B1 imaginary input */
	    C4 = vmulq_f32(out1.val[0], out2.val[1]);

	/* Step 4: Output or accumlate data for group d:*/
	    /* subtract 4 samples at time from real result to imaginary result */
	    out1.val[0] = vsubq_f32(C1, C2);
	    /* add real*imaginary result with imaginary*real result 4 at a time */
	    out1.val[1] = vaddq_f32(C3, C4);
		acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]);  /* add by Hank */
		acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]); /* add by Hank */

	    //out1 = vzipq_f32(out1.val[0], out1.val[1]);

	/* Step 5: Store data */
	    /* Store 4 complex samples to destination buffer */
	    //vst1q_f32(pDst, out1.val[0]);
	    //pDst += 4u;
	    //vst1q_f32(pDst, out1.val[1]);
	    //pDst += 4u;

    /* Decrement the numSamples loop counter */
    blkCnt--;
  }

  blkCnt = numSamples & 3u;

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.     
   ** No intrinsics is used. */
  sum_real =0;
  sum_img =0;
  while(blkCnt > 0u)
  {
    /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1].  */
    /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i].  */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    /* store the result in the destination buffer. */
    sum_real += ((a * c) - (b * d));
    sum_img += ((a * d) + (b * c));

    /* Decrement the numSamples loop counter */
    blkCnt--;
  }

	/* add 4 accumulators */
	acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]);
	acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]);
	acc2.val[0] = vaddq_f32(acc3.val[0], acc4.val[0]);
	acc2.val[1] = vaddq_f32(acc3.val[1], acc4.val[1]);
	acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]);
	acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]);

	sum_real += vgetq_lane_f32(acc1.val[0], 0) + vgetq_lane_f32(acc1.val[0], 1)
		+ vgetq_lane_f32(acc1.val[0], 2) + vgetq_lane_f32(acc1.val[0], 3);
	sum_img += vgetq_lane_f32(acc1.val[1], 0) + vgetq_lane_f32(acc1.val[1], 1)
		+ vgetq_lane_f32(acc1.val[1], 2) + vgetq_lane_f32(acc1.val[1], 3);

	*pDst++=sum_real;
	*pDst++=sum_img;
/* useful when debuggin.. */
void print4(float32x4_t v) {
  /* float *p = (float*)&v; */    /* Commented out to avoid compiler warning of unused variable */
  printf("[%13.8g, %13.8g, %13.8g, %13.8g]", vgetq_lane_f32(v,0), vgetq_lane_f32(v, 1), vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}
Beispiel #8
0
float32_t test_vgetq_lane_f32(float32x4_t a) {
  // CHECK-LABEL: test_vgetq_lane_f32:
  // CHECK-NEXT:  mov s0, v0[3]
  // CHECK-NEXT:  ret
  return vgetq_lane_f32(a, 3);
}
Beispiel #9
0
/* useful when debuggin.. */
void print4(float32x4_t v) {
  float *p = (float*)&v; 
  printf("[%13.8g, %13.8g, %13.8g, %13.8g]", vgetq_lane_f32(v,0), vgetq_lane_f32(v, 1), vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}
/*******************************************************************************
* PROCEDURE: gaussian_smooth
* PURPOSE: Blur an image with a gaussian filter.
* NAME: Mike Heath
* DATE: 2/15/96
*******************************************************************************/
short int* gaussian_smooth(unsigned char *image, int rows, int cols, float sigma)
{
    int r, c, rr, cc,     /* Counter variables. */
        windowsize,        /* Dimension of the gaussian kernel. */
        center;            /* Half of the windowsize. */
    float *tempim,*tempim1,        /* Buffer for separable filter gaussian smoothing. */
          *kernel,        /* A one dimensional gaussian kernel. */
          dot,            /* Dot product summing variable. */
          sum;            /* Sum of the kernel weights variable. */

    /****************************************************************************
    * Create a 1-dimensional gaussian smoothing kernel.
    ****************************************************************************/
    if(VERBOSE) printf("   Computing the gaussian smoothing kernel.\n");
    make_gaussian_kernel(sigma, &kernel, &windowsize);
    center = windowsize / 2;


    /****************************************************************************
    * Allocate a temporary buffer image and the smoothed image.
    ****************************************************************************/
    if((tempim = (float *) malloc(rows*cols* sizeof(float))) == NULL)
    {
        fprintf(stderr, "Error allocating the buffer image.\n");
        exit(1);
    }
    short int* smoothedim;

    if(((smoothedim) = (short int *) malloc(rows*cols*sizeof(short int))) == NULL)
    {
        fprintf(stderr, "Error allocating the smoothed image.\n");
        exit(1);
    }
    startTimer(&totalTime);
    //Neon impelementation of gaussian smooth starts here
    /****************************************************************************
    * Blur in the x - direction.
    ****************************************************************************/
	int loop; 	
	int floop;
    //Modification of input image for neon implementation
    //For Filter 1
	float * new_image;
    //For Filter 2
	float *new_image_col;
    //kernel is changed to 17 from 15 for neon (two 0s at the beginning and the end)
	float new_kernel[17];

    //Generating now kernel filter
	for (floop = 0 ; floop < 17 ; floop++)
	{
		if(floop == 0 || floop == 16 )
			new_kernel[floop] = 0 ;
		else
			new_kernel [floop] = kernel[floop -1];	
	}
    //For filter 1, new cols number for neon
	unsigned int new_cols;
	new_cols=cols+16;
	unsigned int i, k; 
	unsigned int a; 
	unsigned int m; 
	unsigned int n, j;

    //Malloc of new image used by neon
	new_image = (float*)malloc(new_cols*rows*sizeof(float));
	for( i =0; i<rows; i++){
		memset(&new_image[i*new_cols],0,8*sizeof(float));

		for( k=0; k<cols;k++){
			new_image[i*new_cols+8+k] = (float)image[i*cols+k];
		}
		memset(&new_image[i*new_cols+8+cols],0,8*sizeof(float));
	}
    // Neon handles four piexel at a time
  	float32x4_t neon_input;
	float32x4_t neon_filter;
	float32x4_t temp_sum;
	float32x2_t tempUpper;
	float32x2_t tempLower; 
	float32_t zero = 0;
	float32_t temp_output;
	float Basekernel = 0.0f;
	float kernelSum;

    //When using the new filter, we always assume the image has more than 9 pixels in a row
    //Base sum for the filter
	for( a=8; a<=16; a++){
		Basekernel += new_kernel[a];
	}

    //Filter 1, filtering row by row
	for(m=0; m<rows; m++){
		for( n=0; n<cols; n++){
			temp_sum = vdupq_n_f32(0);
			if(n==0){
				kernelSum = Basekernel;
			}
			else if(n <=8){
				kernelSum += new_kernel[8-n];
			}
			else if(n>=cols-8){
				kernelSum -=new_kernel[cols-n+8];
			}

            //For each pixel, filtering is performed four times
			for( j=0; j<4; j++)
			{
				int kk=0;
				if(j>=2)
				{
					kk=1;
				}
				neon_input = vld1q_f32(&new_image[m*new_cols+n+j*4+kk]);
				neon_filter = vld1q_f32(&new_kernel[j*4+kk]);
				temp_sum = vmlaq_f32(temp_sum,neon_input,neon_filter);
			}
			
			unsigned int t;
	
			for( t=0; t<=3; t++){	
						
				temp_output += vgetq_lane_f32(temp_sum,t ); 

			}
			temp_output += new_image[m*new_cols+n+8] * new_kernel[8];
			temp_output /= kernelSum;
			tempim[m*cols+n] = temp_output;
			temp_output=0; 
		}
	}

   	
     for(r=0; r<rows; r++)
     {
         for(c=0; c<cols; c++)
         {
             dot = 0.0;
             sum = 0.0;
             for(cc=(-center); cc<=center; cc++)
             {
             	   if(((c+cc) >= 0) && ((c+cc) < cols))
                 {
                    dot += (float)image[r*cols+(c+cc)] * kernel[center+cc];
                     sum += kernel[center+cc];
                 }
             }
             tempim1[r*cols+c] = dot/sum;
         }
     }

    /****************************************************************************
    * Blur in the y - direction.
    ****************************************************************************/

    unsigned int new_rows;
	new_rows=rows+16;
	new_image_col = (float*)malloc(new_rows*cols*sizeof(float));
	if(VERBOSE) printf("   Bluring the image in the Y-direction.\n");

	for( i =0; i<cols; i++){//actually nember of new rows are the number of columns here 
		memset(&new_image_col[i*new_rows],0,8*sizeof(float));

		for( k=0; k<rows;k++){
			new_image_col[i*new_rows+8+k] = tempim[k*cols+i];
			//new_image_col[i*new_rows+8+k] = imagetest1[k*cols+i];
		}
		memset(&new_image_col[i*new_rows+8+rows],0,8*sizeof(float));
	}

	Basekernel = 0.0; 
	for( a=8; a<=16; a++){
		Basekernel += new_kernel[a];
	}

	for(m=0; m<cols; m++){// it was rows at br
		for( n=0; n<rows; n++){
			temp_sum = vdupq_n_f32(0);
			if(n==0){
				kernelSum = Basekernel;
			}
			else if(n <=8){
				kernelSum += new_kernel[8-n];
			}
			else if(n>=rows-8){
				kernelSum -=new_kernel[rows-n+8];
			}

			for( j=0; j<4; j++)
			{
				int kk=0;
				if(j>=2)
				{
					kk=1;
				}
				neon_input = vld1q_f32(&new_image_col[m*new_rows+n+j*4+kk]);
			 	neon_filter = vld1q_f32(&new_kernel[j*4+kk]);
				temp_sum = vmlaq_f32(temp_sum,neon_input,neon_filter);
			}
			
			unsigned int t;
			for( t=0; t<=3; t++){	
						
				temp_output += vgetq_lane_f32(temp_sum,t ); 
			}
			temp_output += new_image_col[m*new_rows+n+8] * new_kernel[8];
			temp_output = (temp_output * BOOSTBLURFACTOR) / kernelSum + 0.5;
			
			 smoothedim[n*cols+m] = (short int )temp_output;
			temp_output=0; 
		}
	}
    stopTimer(&totalTime);
    printTimer(&totalTime);
    
    free(tempim);
    free(kernel);
    return smoothedim;
}
Beispiel #11
0
// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   ret float [[VGETQ_LANE]]
float32_t test_vgetq_lane_f32(float32x4_t a) {
  return vgetq_lane_f32(a, 3);
}