/* f32x4 mm mul */
/* C = A * B with A (Row x T), B (T x Col), C (Row x Col). The index arithmetic
 * below assumes all dimensions are multiples of 4 and reads A in column-sized
 * strides of Row floats. */
void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C)
{
  int i, k, j;
  float32x4_t neon_b, neon_c;
  float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
  float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

  for (i = 0; i < Row; i+=4)
  {
    for (k = 0; k < Col; k+=1)
    {
      int k_Row = k * Row;
      neon_c = vmovq_n_f32(0);

      for (j = 0; j < T; j+=4)
      {
        int j_T = j * T + i;

        /* four consecutive columns of A, each Row floats apart */
        neon_a0 = vld1q_f32(A + j_T);
        j_T+=Row;
        neon_a1 = vld1q_f32(A + j_T);
        j_T+=Row;
        neon_a2 = vld1q_f32(A + j_T);
        j_T+=Row;
        neon_a3 = vld1q_f32(A + j_T);

        /* broadcast four elements of B's column k */
        neon_b = vld1q_f32(B + k_Row + j);
        neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
        neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
        neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
        neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

        neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
        neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
        neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
        neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
      }

      /* store the finished 4-row block of C's column k */
      vst1q_lane_f32(C + k_Row + i, neon_c, 0);
      vst1q_lane_f32(C + k_Row + i + 1, neon_c, 1);
      vst1q_lane_f32(C + k_Row + i + 2, neon_c, 2);
      vst1q_lane_f32(C + k_Row + i + 3, neon_c, 3);
    }
  }
}
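/* A minimal usage sketch for the routine above (not part of the original
 * source): multiplies two small column-major matrices whose dimensions are
 * multiples of 4, here simply 4x4 with Row = T = Col = 4. */
#include <arm_neon.h>
#include <stdio.h>

void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C); /* defined above */

int main(void)
{
  /* A is the 4x4 identity, so C should come back equal to B (column-major). */
  float A[16] = {1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1};
  float B[16] = {1,2,3,4,  5,6,7,8,  9,10,11,12,  13,14,15,16};
  float C[16];

  mw_neon_mm_mul_f32x4(A, 4, 4, B, 4, C);  /* Row = 4, T = 4, Col = 4 */

  for (int i = 0; i < 16; i++)
    printf("%g%c", C[i], (i % 4 == 3) ? '\n' : ' ');
  return 0;
}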
void test_vgetQ_lanef32 (void)
{
  float32_t out_float32_t;
  float32x4_t arg0_float32x4_t;

  out_float32_t = vgetq_lane_f32 (arg0_float32x4_t, 1);
}
void test_square_root_v4sf ()
{
  const float32_t pool[] = {4.0f, 9.0f, 16.0f, 25.0f};
  float32x4_t val;
  float32x4_t res;

  val = vld1q_f32 (pool);
  res = vsqrtq_f32 (val);

  if (vgetq_lane_f32 (res, 0) != 2.0f) abort ();
  if (vgetq_lane_f32 (res, 1) != 3.0f) abort ();
  if (vgetq_lane_f32 (res, 2) != 4.0f) abort ();
  if (vgetq_lane_f32 (res, 3) != 5.0f) abort ();
}
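/* Note: vsqrtq_f32 is an AArch64 (ARMv8) intrinsic. A sketch of a common
 * ARMv7 NEON substitute (the helper name vsqrtq_f32_approx is an assumption):
 * refine the vrsqrteq_f32 reciprocal-square-root estimate with Newton-Raphson
 * steps, then multiply by x. Assumes every lane is strictly positive, since
 * the estimate for 0 is infinity. */
#include <arm_neon.h>

static inline float32x4_t vsqrtq_f32_approx(float32x4_t x)
{
  float32x4_t e = vrsqrteq_f32(x);                     /* initial 1/sqrt(x) estimate */
  e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e));  /* Newton-Raphson refinement 1 */
  e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e));  /* Newton-Raphson refinement 2 */
  return vmulq_f32(x, e);                              /* sqrt(x) = x * (1/sqrt(x)) */
}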
/* f32x4 mv mul */
/* C = A * B with A (Row x T) and B a vector of length T. The index arithmetic
 * assumes Row and T are multiples of 4 and reads A in column-sized strides of
 * Row floats. */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
  int i = 0;
  int k = 0;
  float32x4_t neon_b, neon_c;
  float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
  float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

  for (i = 0; i < Row; i+=4)
  {
    neon_c = vmovq_n_f32(0);

    for (k = 0; k < T; k+=4)
    {
      int j = k * T + i;

      /* four consecutive columns of A, each Row floats apart */
      neon_a0 = vld1q_f32(A + j);
      neon_a1 = vld1q_f32(A + j + Row);
      neon_a2 = vld1q_f32(A + j + 2 * Row);
      neon_a3 = vld1q_f32(A + j + 3 * Row);

      /* broadcast four elements of B */
      neon_b = vld1q_f32(B + k);
      neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
      neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
      neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
      neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

      neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
      neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
      neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
      neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);
    }

    vst1q_f32(C + i, neon_c);
  }
}
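/* A scalar reference for the matrix-vector product above (an assumption added
 * for checking results, not part of the original source). It uses plain
 * column-major indexing, element (i, k) of A at A[k * Row + i], which matches
 * the NEON routine's index arithmetic when Row == T. */
static void mv_mul_ref(const float *A, int Row, int T, const float *B, float *C)
{
  for (int i = 0; i < Row; i++) {
    float acc = 0.0f;
    for (int k = 0; k < T; k++)
      acc += A[k * Row + i] * B[k];
    C[i] = acc;
  }
}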
// CHECK-LABEL: test_vgetq_lane_f32:
float32_t test_vgetq_lane_f32(float32x4_t v) {
  return vgetq_lane_f32(v, 0);
// CHECK-NEXT: ret
}
__INLINE void arm_cmplx_mult_cmplx_f32_dot(
  float32_t * pSrcA,
  float32_t * pSrcB,
  float32_t * pDst,
  uint32_t numSamples)
{
  float32_t a, b, c, d;                  /* Temporary real and imaginary values */
  float32x4_t A1, A2;                    /* Real/imaginary values of source buffer A */
  float32x4_t B1, B2;                    /* Real/imaginary values of source buffer B */
  float32x4_t C1, C2, C3, C4;            /* Multiplication outputs */
  float32x4x2_t out1, out2, out3, out4;  /* Temporary variables to store output results */
  float32x4x2_t acc1, acc2, acc3, acc4;  /* Accumulators */
  float sum_real, sum_img;               /* Scalar accumulators for the tail loop */
  uint32_t blkCnt;                       /* Loop counter */

  /* Clear accumulators.
     VDUP.32 q0,r0: Vector Duplicate copies a scalar into every element of the destination vector. */
  acc1.val[0] = vdupq_n_f32(0.0f);
  acc1.val[1] = vdupq_n_f32(0.0f);
  acc2.val[0] = vdupq_n_f32(0.0f);
  acc2.val[1] = vdupq_n_f32(0.0f);
  acc3.val[0] = vdupq_n_f32(0.0f);
  acc3.val[1] = vdupq_n_f32(0.0f);
  acc4.val[0] = vdupq_n_f32(0.0f);
  acc4.val[1] = vdupq_n_f32(0.0f);

  /* Loop over blockSize number of values: 16 complex samples (groups a-d) per iteration. */
  blkCnt = numSamples >> 4u;

  while(blkCnt > 0u)
  {
    /* A1, A2, B1, B2 each hold two complex values. */

    /* Step 1: Load A1, A2, B1, B2 for group a.
       vld1q_f32 (VLD1.32 {d0, d1}, [r0]) reads 2 complex values at a time. */
    A1 = vld1q_f32(pSrcA);  pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);  pSrcA += 4u;
    B1 = vld1q_f32(pSrcB);  pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);  pSrcB += 4u;

    /* Step 2: Unzip out1, out2 for group a.
       A1: reala0, imga0, reala1, imga1; A2: realb0, imgb0, realb1, imgb1
       out1.val[0]: reala0, reala1, realb0, realb1; out1.val[1]: imga0, imga1, imgb0, imgb1
       vuzpq_f32 (VUZP.32 q0, q1): Vector Unzip de-interleaves the elements of two vectors. */
    out1 = vuzpq_f32(A1, A2);
    out2 = vuzpq_f32(B1, B2);

    /* Step 1: Load A1, A2, B1, B2 for group b. */
    A1 = vld1q_f32(pSrcA);  pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);  pSrcA += 4u;
    B1 = vld1q_f32(pSrcB);  pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);  pSrcB += 4u;

    /* Step 3: Compute C1, C2, C3, C4 for group a.
       C[2*i]   = A[2*i] * B[2*i]   - A[2*i+1] * B[2*i+1]
       C[2*i+1] = A[2*i] * B[2*i+1] + A[2*i+1] * B[2*i]
       vmulq_f32 (VMUL.F32); val[0]: real, val[1]: imag.
       C1 = a.real*b.real; C2 = a.img*b.img; C3 = a.img*b.real; C4 = a.real*b.img */
    C1 = vmulq_f32(out1.val[0], out2.val[0]);  /* real * real */
    C2 = vmulq_f32(out1.val[1], out2.val[1]);  /* imag * imag */
    C3 = vmulq_f32(out1.val[1], out2.val[0]);  /* imag * real */
    C4 = vmulq_f32(out1.val[0], out2.val[1]);  /* real * imag */
    /* real: C1 - C2; imag: C3 + C4 */

    /* Step 2: Unzip out2, out3 for group b. */
    out2 = vuzpq_f32(A1, A2);
    out3 = vuzpq_f32(B1, B2);

    /* Step 1: Load A1, A2 for group c. */
    A1 = vld1q_f32(pSrcA);  pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);  pSrcA += 4u;

    /* Step 4: Accumulate group a.
       (a+bi)*(c+di) = (ac-bd) + (ad+bc)i; real: C1 - C2; imag: C3 + C4.  vaddq_f32: VADD.F32 */
    out1.val[0] = vsubq_f32(C1, C2);
    acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]);  /* add by Hank */
    out1.val[1] = vaddq_f32(C3, C4);
    acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]);  /* add by Hank */
    /* out1 now holds four complex products. */

    /* Step 1: Load B1, B2 for group c. */
    B1 = vld1q_f32(pSrcB);  pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);  pSrcB += 4u;

    /* Step 3: Compute C1, C2 for group b. */
    C1 = vmulq_f32(out2.val[0], out3.val[0]);
    C2 = vmulq_f32(out2.val[1], out3.val[1]);

    /* Step 5: Store data for group a (disabled; results are accumulated instead).
       VST2.32 {d0, d2}, [r0] */
    //vst2q_f32(pDst, out1);
    //pDst += 8u;

    /* Step 3: Compute C3, C4 for group b. */
    C3 = vmulq_f32(out2.val[1], out3.val[0]);
    C4 = vmulq_f32(out2.val[0], out3.val[1]);

    /* Step 2: Unzip out3, out4 for group c. */
    out3 = vuzpq_f32(A1, A2);
    out4 = vuzpq_f32(B1, B2);

    /* Step 1: Load A1, A2, B1, B2 for group d. */
    A1 = vld1q_f32(pSrcA);  pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);  pSrcA += 4u;
    B1 = vld1q_f32(pSrcB);  pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);  pSrcB += 4u;

    /* Step 4: Accumulate group b. */
    out2.val[0] = vsubq_f32(C1, C2);
    out2.val[1] = vaddq_f32(C3, C4);
    acc2.val[0] = vaddq_f32(out2.val[0], acc2.val[0]);  /* add by Hank */
    acc2.val[1] = vaddq_f32(out2.val[1], acc2.val[1]);  /* add by Hank */

    /* Step 3: Compute C1, C2, C3, C4 for group c. */
    C1 = vmulq_f32(out3.val[0], out4.val[0]);
    C2 = vmulq_f32(out3.val[1], out4.val[1]);
    C3 = vmulq_f32(out3.val[1], out4.val[0]);
    C4 = vmulq_f32(out3.val[0], out4.val[1]);

    /* Step 2: Unzip out1, out4 for group d. */
    out1 = vuzpq_f32(A1, A2);
    out4 = vuzpq_f32(B1, B2);

    /* Step 5: Store data for group b (disabled). */
    //vst2q_f32(pDst, out2);
    //pDst += 8u;

    /* Step 4: Accumulate group c. */
    out3.val[0] = vsubq_f32(C1, C2);
    out3.val[1] = vaddq_f32(C3, C4);
    acc3.val[0] = vaddq_f32(out3.val[0], acc3.val[0]);  /* add by Hank */
    acc3.val[1] = vaddq_f32(out3.val[1], acc3.val[1]);  /* add by Hank */

    /* Step 3: Compute C1, C2, C3, C4 for group d. */
    C1 = vmulq_f32(out1.val[0], out4.val[0]);
    C2 = vmulq_f32(out1.val[1], out4.val[1]);
    C3 = vmulq_f32(out1.val[1], out4.val[0]);
    C4 = vmulq_f32(out1.val[0], out4.val[1]);

    /* Step 5: Store data for group c (disabled). */
    //vst2q_f32(pDst, out3);
    //pDst += 8u;

    /* Step 4: Accumulate group d. */
    out4.val[0] = vsubq_f32(C1, C2);
    out4.val[1] = vaddq_f32(C3, C4);
    acc4.val[0] = vaddq_f32(out4.val[0], acc4.val[0]);  /* add by Hank */
    acc4.val[1] = vaddq_f32(out4.val[1], acc4.val[1]);  /* add by Hank */

    /* Zip real and imag values (disabled). */
    //out4 = vzipq_f32(out4.val[0], out4.val[1]);

    /* Step 5: Store data for group d (disabled). */
    //vst1q_f32(pDst, out4.val[0]);
    //pDst += 4u;
    //vst1q_f32(pDst, out4.val[1]);
    //pDst += 4u;

    /* Decrement the numSamples loop counter. */
    blkCnt--;
  }

  /* If blockSize is not a multiple of 16, compute the remaining output samples:
     multiples of 4 in this second loop, then the remaining 1 to 3 samples in the third loop. */
  blkCnt = numSamples & 15u;
  blkCnt = blkCnt >> 2u;

  while(blkCnt > 0u)
  {
    /* Step 1: Load A1, A2, B1, B2 (4 complex values from each source). */
    A1 = vld1q_f32(pSrcA);  pSrcA += 4u;
    A2 = vld1q_f32(pSrcA);  pSrcA += 4u;
    B1 = vld1q_f32(pSrcB);  pSrcB += 4u;
    B2 = vld1q_f32(pSrcB);  pSrcB += 4u;

    /* Step 2: Unzip data. */
    out1 = vuzpq_f32(A1, A2);
    out2 = vuzpq_f32(B1, B2);

    /* Step 3: Compute C1, C2, C3, C4.
       C[2*i] = A[2*i]*B[2*i] - A[2*i+1]*B[2*i+1]; C[2*i+1] = A[2*i]*B[2*i+1] + A[2*i+1]*B[2*i] */
    C1 = vmulq_f32(out1.val[0], out2.val[0]);
    C2 = vmulq_f32(out1.val[1], out2.val[1]);
    C3 = vmulq_f32(out1.val[1], out2.val[0]);
    C4 = vmulq_f32(out1.val[0], out2.val[1]);

    /* Step 4: Accumulate. */
    out1.val[0] = vsubq_f32(C1, C2);
    out1.val[1] = vaddq_f32(C3, C4);
    acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]);  /* add by Hank */
    acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]);  /* add by Hank */
    //out1 = vzipq_f32(out1.val[0], out1.val[1]);

    /* Step 5: Store data (disabled). */
    //vst1q_f32(pDst, out1.val[0]);
    //pDst += 4u;
    //vst1q_f32(pDst, out1.val[1]);
    //pDst += 4u;

    /* Decrement the numSamples loop counter. */
    blkCnt--;
  }

  /* If blockSize is not a multiple of 4, compute any remaining samples here without intrinsics. */
  blkCnt = numSamples & 3u;
  sum_real = 0;
  sum_img = 0;

  while(blkCnt > 0u)
  {
    /* C[2*i] = A[2*i]*B[2*i] - A[2*i+1]*B[2*i+1]; C[2*i+1] = A[2*i]*B[2*i+1] + A[2*i+1]*B[2*i] */
    a = *pSrcA++;
    b = *pSrcA++;
    c = *pSrcB++;
    d = *pSrcB++;

    sum_real += ((a * c) - (b * d));
    sum_img  += ((a * d) + (b * c));

    /* Decrement the numSamples loop counter. */
    blkCnt--;
  }

  /* Add the 4 accumulators. */
  acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]);
  acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]);
  acc2.val[0] = vaddq_f32(acc3.val[0], acc4.val[0]);
  acc2.val[1] = vaddq_f32(acc3.val[1], acc4.val[1]);
  acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]);
  acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]);

  /* Horizontal reduction of the vector accumulators. */
  sum_real += vgetq_lane_f32(acc1.val[0], 0) + vgetq_lane_f32(acc1.val[0], 1)
            + vgetq_lane_f32(acc1.val[0], 2) + vgetq_lane_f32(acc1.val[0], 3);
  sum_img  += vgetq_lane_f32(acc1.val[1], 0) + vgetq_lane_f32(acc1.val[1], 1)
            + vgetq_lane_f32(acc1.val[1], 2) + vgetq_lane_f32(acc1.val[1], 3);

  *pDst++ = sum_real;
  *pDst++ = sum_img;
}
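/* A plain scalar reference for the accumulating complex multiply above (a
 * sketch for verification; the name cmplx_dot_ref and the {real, imag} output
 * convention mirror the tail loop but are otherwise assumptions). */
#include <stdint.h>

static void cmplx_dot_ref(const float *pSrcA, const float *pSrcB,
                          float *pDst, uint32_t numSamples)
{
  float sum_real = 0.0f, sum_img = 0.0f;
  for (uint32_t i = 0; i < numSamples; i++) {
    float a = pSrcA[2 * i], b = pSrcA[2 * i + 1];  /* a + bi */
    float c = pSrcB[2 * i], d = pSrcB[2 * i + 1];  /* c + di */
    sum_real += a * c - b * d;                     /* real part of (a+bi)(c+di) */
    sum_img  += a * d + b * c;                     /* imaginary part */
  }
  pDst[0] = sum_real;
  pDst[1] = sum_img;
}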
/* useful when debugging */
void print4(float32x4_t v)
{
  /* float *p = (float*)&v; */ /* Commented out to avoid a compiler warning about an unused variable */
  printf("[%13.8g, %13.8g, %13.8g, %13.8g]",
         vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1),
         vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}
float32_t test_vgetq_lane_f32(float32x4_t a) {
  // CHECK-LABEL: test_vgetq_lane_f32:
  // CHECK-NEXT: mov s0, v0[3]
  // CHECK-NEXT: ret
  return vgetq_lane_f32(a, 3);
}
/* useful when debugging */
void print4(float32x4_t v)
{
  float *p = (float*)&v;
  printf("[%13.8g, %13.8g, %13.8g, %13.8g]",
         vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1),
         vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
}
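/* The lane-by-lane reductions used in this file (adding four vgetq_lane_f32
 * results) can also be done with pairwise adds; a small helper sketch (the
 * name hsum_f32x4 is an assumption): */
#include <arm_neon.h>

static inline float hsum_f32x4(float32x4_t v)
{
  float32x2_t s = vadd_f32(vget_low_f32(v), vget_high_f32(v)); /* lanes 0+2, 1+3 */
  s = vpadd_f32(s, s);                                         /* fold the remaining pair */
  return vget_lane_f32(s, 0);
}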
/*******************************************************************************
* PROCEDURE: gaussian_smooth
* PURPOSE: Blur an image with a gaussian filter.
* NAME: Mike Heath
* DATE: 2/15/96
*******************************************************************************/
short int* gaussian_smooth(unsigned char *image, int rows, int cols, float sigma)
{
   int r, c, rr, cc,         /* Counter variables. */
      windowsize,            /* Dimension of the gaussian kernel. */
      center;                /* Half of the windowsize. */
   float *tempim, *tempim1,  /* Buffers for separable filter gaussian smoothing. */
         *kernel,            /* A one dimensional gaussian kernel. */
         dot,                /* Dot product summing variable. */
         sum;                /* Sum of the kernel weights variable. */

   /****************************************************************************
   * Create a 1-dimensional gaussian smoothing kernel.
   ****************************************************************************/
   if(VERBOSE) printf(" Computing the gaussian smoothing kernel.\n");
   make_gaussian_kernel(sigma, &kernel, &windowsize);
   center = windowsize / 2;

   /****************************************************************************
   * Allocate the temporary buffer images and the smoothed image.
   ****************************************************************************/
   if((tempim = (float *) malloc(rows*cols*sizeof(float))) == NULL)
   {
      fprintf(stderr, "Error allocating the buffer image.\n");
      exit(1);
   }
   /* tempim1 holds the scalar x-direction result; it was used but never allocated. */
   if((tempim1 = (float *) malloc(rows*cols*sizeof(float))) == NULL)
   {
      fprintf(stderr, "Error allocating the buffer image.\n");
      exit(1);
   }
   short int* smoothedim;
   if((smoothedim = (short int *) malloc(rows*cols*sizeof(short int))) == NULL)
   {
      fprintf(stderr, "Error allocating the smoothed image.\n");
      exit(1);
   }

   startTimer(&totalTime);

   //Neon implementation of gaussian smooth starts here
   /****************************************************************************
   * Blur in the x - direction.
   ****************************************************************************/
   int loop;
   int floop;

   //Modification of input image for neon implementation
   //For Filter 1
   float *new_image;
   //For Filter 2
   float *new_image_col;

   //kernel is changed to 17 from 15 for neon (two 0s at the beginning and the end)
   float new_kernel[17];

   //Generating the new kernel filter
   for (floop = 0; floop < 17; floop++)
   {
      if(floop == 0 || floop == 16)
         new_kernel[floop] = 0;
      else
         new_kernel[floop] = kernel[floop - 1];
   }

   //For filter 1, new number of cols for neon
   unsigned int new_cols;
   new_cols = cols + 16;

   unsigned int i, k;
   unsigned int a;
   unsigned int m;
   unsigned int n, j;

   //Malloc of the padded image used by neon
   new_image = (float*)malloc(new_cols*rows*sizeof(float));
   for(i = 0; i < rows; i++){
      memset(&new_image[i*new_cols], 0, 8*sizeof(float));
      for(k = 0; k < cols; k++){
         new_image[i*new_cols+8+k] = (float)image[i*cols+k];
      }
      memset(&new_image[i*new_cols+8+cols], 0, 8*sizeof(float));
   }

   //Neon handles four pixels at a time
   float32x4_t neon_input;
   float32x4_t neon_filter;
   float32x4_t temp_sum;
   float32x2_t tempUpper;
   float32x2_t tempLower;
   float32_t zero = 0;
   float32_t temp_output = 0;   /* initialized; its first use below is "+=" */
   float Basekernel = 0.0f;
   float kernelSum;

   //When using the new filter, we always assume the image has more than 9 pixels in a row
   //Base sum for the filter
   for(a = 8; a <= 16; a++){
      Basekernel += new_kernel[a];
   }

   //Filter 1, filtering row by row
   for(m = 0; m < rows; m++){
      for(n = 0; n < cols; n++){
         temp_sum = vdupq_n_f32(0);
         if(n == 0){
            kernelSum = Basekernel;
         }
         else if(n <= 8){
            kernelSum += new_kernel[8-n];
         }
         else if(n >= cols-8){
            kernelSum -= new_kernel[cols-n+8];
         }
         //For each pixel, filtering is performed four times
         for(j = 0; j < 4; j++)
         {
            int kk = 0;
            if(j >= 2)
            {
               kk = 1;
            }
            neon_input = vld1q_f32(&new_image[m*new_cols+n+j*4+kk]);
            neon_filter = vld1q_f32(&new_kernel[j*4+kk]);
            temp_sum = vmlaq_f32(temp_sum, neon_input, neon_filter);
         }
         unsigned int t;
         for(t = 0; t <= 3; t++){
            temp_output += vgetq_lane_f32(temp_sum, t);
         }
         temp_output += new_image[m*new_cols+n+8] * new_kernel[8];
         temp_output /= kernelSum;
         tempim[m*cols+n] = temp_output;
         temp_output = 0;
      }
   }

   //Scalar filtering in the x-direction into tempim1 (not used by the NEON path below)
   for(r = 0; r < rows; r++)
   {
      for(c = 0; c < cols; c++)
      {
         dot = 0.0;
         sum = 0.0;
         for(cc = (-center); cc <= center; cc++)
         {
            if(((c+cc) >= 0) && ((c+cc) < cols))
            {
               dot += (float)image[r*cols+(c+cc)] * kernel[center+cc];
               sum += kernel[center+cc];
            }
         }
         tempim1[r*cols+c] = dot/sum;
      }
   }

   /****************************************************************************
   * Blur in the y - direction.
   ****************************************************************************/
   unsigned int new_rows;
   new_rows = rows + 16;
   new_image_col = (float*)malloc(new_rows*cols*sizeof(float));

   if(VERBOSE) printf(" Blurring the image in the Y-direction.\n");

   for(i = 0; i < cols; i++){ //the number of new rows here is actually the number of columns
      memset(&new_image_col[i*new_rows], 0, 8*sizeof(float));
      for(k = 0; k < rows; k++){
         new_image_col[i*new_rows+8+k] = tempim[k*cols+i];
         //new_image_col[i*new_rows+8+k] = imagetest1[k*cols+i];
      }
      memset(&new_image_col[i*new_rows+8+rows], 0, 8*sizeof(float));
   }

   Basekernel = 0.0;
   for(a = 8; a <= 16; a++){
      Basekernel += new_kernel[a];
   }

   for(m = 0; m < cols; m++){ //loop over columns (it was rows in the x-direction pass)
      for(n = 0; n < rows; n++){
         temp_sum = vdupq_n_f32(0);
         if(n == 0){
            kernelSum = Basekernel;
         }
         else if(n <= 8){
            kernelSum += new_kernel[8-n];
         }
         else if(n >= rows-8){
            kernelSum -= new_kernel[rows-n+8];
         }
         for(j = 0; j < 4; j++)
         {
            int kk = 0;
            if(j >= 2)
            {
               kk = 1;
            }
            neon_input = vld1q_f32(&new_image_col[m*new_rows+n+j*4+kk]);
            neon_filter = vld1q_f32(&new_kernel[j*4+kk]);
            temp_sum = vmlaq_f32(temp_sum, neon_input, neon_filter);
         }
         unsigned int t;
         for(t = 0; t <= 3; t++){
            temp_output += vgetq_lane_f32(temp_sum, t);
         }
         temp_output += new_image_col[m*new_rows+n+8] * new_kernel[8];
         temp_output = (temp_output * BOOSTBLURFACTOR) / kernelSum + 0.5;
         smoothedim[n*cols+m] = (short int)temp_output;
         temp_output = 0;
      }
   }

   stopTimer(&totalTime);
   printTimer(&totalTime);

   free(tempim);
   free(tempim1);
   free(new_image);
   free(new_image_col);
   free(kernel);
   return smoothedim;
}
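/* gaussian_smooth relies on make_gaussian_kernel, which is not part of this
 * file. A sketch of a compatible helper (the windowsize formula is an
 * assumption): sample a 1-D Gaussian around the centre tap and normalize the
 * weights so they sum to 1. The NEON path above expects a 15-tap kernel. */
#include <math.h>
#include <stdlib.h>

void make_gaussian_kernel(float sigma, float **kernel, int *windowsize)
{
   *windowsize = 1 + 2 * (int)ceil(2.5 * sigma);
   int center = *windowsize / 2;
   float sum = 0.0f;

   *kernel = (float *)malloc(*windowsize * sizeof(float));
   for (int i = 0; i < *windowsize; i++) {
      float x = (float)(i - center);
      (*kernel)[i] = (float)exp(-0.5 * x * x / (sigma * sigma));
      sum += (*kernel)[i];
   }
   for (int i = 0; i < *windowsize; i++)
      (*kernel)[i] /= sum;   /* normalize so the kernel weights sum to 1 */
}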
// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
// CHECK:   ret float [[VGETQ_LANE]]
float32_t test_vgetq_lane_f32(float32x4_t a) {
  return vgetq_lane_f32(a, 3);
}