/* Exercises vgetq_lane_u16 with a uint16x8_t argument and the constant lane
   index 1.  The source vector is never initialised and the extracted value
   is never read, so only the call itself is of interest here (presumably a
   compile-only test).  */
void test_vgetQ_laneu16 (void)
{
  uint16x8_t source_vector;
  uint16_t lane_value;

  lane_value = vgetq_lane_u16 (source_vector, 1);
}
/*
 * u16x8 mv mul
 *
 * Matrix-vector product C = A * B on unsigned 16-bit elements, processed in
 * 8x8 tiles with NEON.
 *
 *   A    Row x T matrix.  Eight consecutive elements are loaded as eight
 *        consecutive rows of one column, and successive columns are Row
 *        elements apart (column-major storage).
 *   Row  number of rows of A and number of elements written to C.
 *   T    number of columns of A and number of elements read from B.
 *   B    input vector, read eight elements at a time.
 *   C    output vector, written eight elements at a time.
 *
 * Both loops advance by 8 and always load/store full 8-lane vectors; there is
 * no tail handling, so Row and T are expected to be multiples of 8.
 * Products and sums are plain 16-bit lane operations (vmulq_u16/vaddq_u16),
 * so results are truncated to 16 bits.
 */
void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, unsigned short * C)
{
    int i;
    int k;
    uint16x8_t neon_b, neon_c;
    uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
    uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

    for (i = 0; i < Row; i += 8)
    {
        /* accumulator for output elements i .. i+7 */
        neon_c = vmovq_n_u16(0);

        for (k = 0; k < T; k += 8)
        {
            /* Offset of element (row i, column k).  Columns are Row elements
               apart (see the j += Row steps below), so the base offset has to
               use Row as the stride too; the previous k * T was only right
               when Row == T. */
            int j = k * Row + i;

            /* rows i..i+7 of columns k..k+7 */
            neon_a0 = vld1q_u16(A + j);
            j += Row;
            neon_a1 = vld1q_u16(A + j);
            j += Row;
            neon_a2 = vld1q_u16(A + j);
            j += Row;
            neon_a3 = vld1q_u16(A + j);
            j += Row;
            neon_a4 = vld1q_u16(A + j);
            j += Row;
            neon_a5 = vld1q_u16(A + j);
            j += Row;
            neon_a6 = vld1q_u16(A + j);
            j += Row;
            neon_a7 = vld1q_u16(A + j);

            /* B[k..k+7], each element broadcast to a full vector */
            neon_b = vld1q_u16(B + k);
            neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
            neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
            neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
            neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
            neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
            neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
            neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
            neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

            /* C[i..i+7] += A[i..i+7][k+n] * B[k+n] for n = 0..7 */
            neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
            neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);
        }

        vst1q_u16(C + i, neon_c);
    }
}
/* Returns lane 6 of v1.  The FileCheck directives inside the body expect the
   extraction to be emitted as a umov from the .h[6] element. */
uint16_t test_vgetq_lane_u16(uint16x8_t v1) {
  // CHECK: test_vgetq_lane_u16
  const uint16_t sixth_lane = vgetq_lane_u16(v1, 6);
  return sixth_lane;
  // CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[6]
}
/** * Get the sum of absolute differences for a specific pixel location and disparity * * @param leftImage left image * @param rightImage right image * @param laplacianL laplacian-fitlered left image * @param laplacianR laplacian-filtered right image * @param pxX row pixel location * @param pxY column pixel location * @param state state structure that includes a number of parameters * @param left_interest optional parameter that will be filled with the value for the left interest operation * @param right_interest same as above, for the right image * * @retval scaled sum of absolute differences for this block -- * the value is the sum/numberOfPixels */ int PushbroomStereo::GetSAD(Mat leftImage, Mat rightImage, Mat laplacianL, Mat laplacianR, int pxX, int pxY, int *left_interest, int *right_interest, int *raw_sad) { // top left corner of the SAD box int startX = pxX; int startY = pxY; // bottom right corner of the SAD box #ifndef USE_NEON int endX = pxX + m_iBlockSize - 1; #endif int endY = pxY + m_iBlockSize - 1; #if USE_SAFTEY_CHECKS int flag = false; if (startX < 0) { printf("Warning: startX < 0\n"); flag = true; } if (endX > rightImage.cols) { printf("Warning: endX > leftImage.cols\n"); flag = true; } if (startX + disparity < 0) { printf("Warning: startX + disparity < 0\n"); flag = true; } if (endX + disparity > rightImage.cols) { printf("Warning: endX + disparity > leftImage.cols\n"); flag = true; } if (endX + disparity > rightImage.cols) { printf("Warning: endX + disparity > rightImage.cols\n"); endX = rightImage.cols - disparity; flag = true; } if (startY < 0) { printf("Warning: startY < 0\n"); flag = true; } if (endY > rightImage.rows) { printf("Warning: endY > rightImage.rows\n"); flag = true; } // disparity might be negative as well if (disparity < 0 && startX + disparity < 0) { printf("Warning: disparity < 0 && startX + disparity < 0\n"); startX = -disparity; flag = true; } if (flag == true) { printf("startX = %d, endX = %d, disparity = %d, startY = %d, 
endY = %d\n", startX, endX, disparity, startY, endY); } startX = max(0, startX); startY = max(0, startY); endX = min(leftImage.cols - disparity, endX); endY = min(leftImage.rows, endY); #endif int leftVal = 0, rightVal = 0; int sad = 0; #ifdef USE_NEON uint16x8_t interest_op_sum_8x_L, interest_op_sum_8x_R, sad_sum_8x; // load zeros into everything interest_op_sum_8x_L = vdupq_n_u16(0); interest_op_sum_8x_R = vdupq_n_u16(0); sad_sum_8x = vdupq_n_u16(0); #endif for (int i=startY;i<=endY;i++) { if(i>=leftImage.rows-1) continue; //Get a pointer for this row uchar *this_rowL = leftImage.ptr<uchar>(i); uchar *this_rowR = rightImage.ptr<uchar>(i); uchar *this_row_laplacianL = laplacianL.ptr<uchar>(i); uchar *this_row_laplacianR = laplacianR.ptr<uchar>(i); #ifdef USE_NEON // load this row into memory uint8x8_t this_row_8x8_L = vld1_u8(this_rowL + startX); uint8x8_t this_row_8x8_R = vld1_u8(this_rowR + startX + disparity); uint8x8_t interest_op_8x8_L = vld1_u8(this_row_laplacianL + startX); uint8x8_t interest_op_8x8_R = vld1_u8(this_row_laplacianR + startX + disparity); // do absolute differencing for the entire row in one operation! 
uint8x8_t sad_8x = vabd_u8(this_row_8x8_L, this_row_8x8_R); // sum up sad_sum_8x = vaddw_u8(sad_sum_8x, sad_8x); // sum laplacian values interest_op_sum_8x_L = vaddw_u8(interest_op_sum_8x_L, interest_op_8x8_L); interest_op_sum_8x_R = vaddw_u8(interest_op_sum_8x_R, interest_op_8x8_R); #else // USE_NEON for (int j=startX;j<=endX;j++) { // we are now looking at a single pixel value /*uchar pxL = leftImage.at<uchar>(i,j); uchar pxR = rightImage.at<uchar>(i,j + disparity); uchar sL = laplacianL.at<uchar>(i,j); uchar sR = laplacianR.at<uchar>(i,j + disparity); */ uchar sL = this_row_laplacianL[j];//laplacianL.at<uchar>(i,j); uchar sR = this_row_laplacianR[j + m_iDisparity]; //laplacianR.at<uchar>(i,j + disparity); leftVal += sL; rightVal += sR; uchar pxL = this_rowL[j]; uchar pxR = this_rowR[j + m_iDisparity]; sad += abs(pxL - pxR); } #endif // USE_NEON } #ifdef USE_NEON // sum up sad = vgetq_lane_u16(sad_sum_8x, 0) + vgetq_lane_u16(sad_sum_8x, 1) + vgetq_lane_u16(sad_sum_8x, 2) + vgetq_lane_u16(sad_sum_8x, 3) + vgetq_lane_u16(sad_sum_8x, 4);// + vgetq_lane_u16(sad_sum_8x, 5) // + vgetq_lane_u16(sad_sum_8x, 6) + vgetq_lane_u16(sad_sum_8x, 7); leftVal = vgetq_lane_u16(interest_op_sum_8x_L, 0) + vgetq_lane_u16(interest_op_sum_8x_L, 1) + vgetq_lane_u16(interest_op_sum_8x_L, 2) + vgetq_lane_u16(interest_op_sum_8x_L, 3) + vgetq_lane_u16(interest_op_sum_8x_L, 4); rightVal = vgetq_lane_u16(interest_op_sum_8x_R, 0) + vgetq_lane_u16(interest_op_sum_8x_R, 1) + vgetq_lane_u16(interest_op_sum_8x_R, 2) + vgetq_lane_u16(interest_op_sum_8x_R, 3) + vgetq_lane_u16(interest_op_sum_8x_R, 4); #endif //cout << "(" << leftVal << ", " << rightVal << ") vs. 
(" << leftVal2 << ", " << rightVal2 << ")" << endl; int laplacian_value = leftVal + rightVal; int fThresh = 200; if((leftVal<fThresh)||(rightVal<fThresh)) laplacian_value /= 10; //cout << "sad with neon: " << sad << " without neon: " << sad2 << endl; if (left_interest != NULL) *left_interest = leftVal; if (right_interest != NULL) *right_interest = rightVal; // percentage of total interest value that is different //float diff_score = 100*(float)abs(leftVal - rightVal)/(float)laplacian_value; if (raw_sad != NULL) *raw_sad = sad; if (leftVal < m_iSobelLimit || rightVal < m_iSobelLimit)// || diff_score > state.interest_diff_limit) return -1; // weight laplacian_value into the score //return sobel; return NUMERIC_CONST*(float)sad/(float)laplacian_value; }
/* Returns lane 7 (the last lane) of a.  The FileCheck directives inside the
   body expect the whole function to be a single umov.h followed by ret. */
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
  // CHECK-LABEL: test_vgetq_lane_u16:
  // CHECK-NEXT: umov.h w0, v0[7]
  // CHECK-NEXT: ret
  const uint16_t last_lane = vgetq_lane_u16(a, 7);
  return last_lane;
}
/*
 * u16x8 mm mul
 *
 * Matrix-matrix product C = A * B on unsigned 16-bit elements, computed as
 * 8-row strips of one output column at a time with NEON.
 *
 *   A    Row x T matrix; eight consecutive elements are eight consecutive
 *        rows of one column, columns are Row elements apart.
 *   Row  number of rows of A and C.
 *   T    number of columns of A and number of rows of B.
 *   B    T x Col matrix; eight consecutive elements of a column are read at
 *        a time, columns are T elements apart.
 *   Col  number of columns of B and C.
 *   C    Row x Col output; columns are Row elements apart.
 *
 * The i and j loops advance by 8 and always load/store full 8-lane vectors;
 * there is no tail handling, so Row and T are expected to be multiples of 8.
 * Products and sums are plain 16-bit lane operations, so results are
 * truncated to 16 bits.
 *
 * Each output strip is stored once, after its accumulation loop; when
 * T <= 0 the strip is therefore written as zeros (the empty sum).
 */
void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, int Col, unsigned short * C)
{
    int i, k, j;
    uint16x8_t neon_b, neon_c;
    uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7;
    uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7;

    for (i = 0; i < Row; i += 8)
    {
        for (k = 0; k < Col; k += 1)
        {
            /* Column k of B holds T elements, column k of C holds Row
               elements, so the two need different strides.  (Using k * Row
               for B was only right when Row == T.) */
            int k_T = k * T;
            int k_Row = k * Row;

            /* accumulator for C[i..i+7][k] */
            neon_c = vmovq_n_u16(0);

            for (j = 0; j < T; j += 8)
            {
                /* Offset of A element (row i, column j).  Columns of A are
                   Row elements apart (see the += Row steps below), so the
                   base offset uses Row as well; j * T was only right when
                   Row == T. */
                int j_Row = j * Row + i;

                /* rows i..i+7 of columns j..j+7 of A */
                neon_a0 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a1 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a2 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a3 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a4 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a5 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a6 = vld1q_u16(A + j_Row);
                j_Row += Row;
                neon_a7 = vld1q_u16(A + j_Row);

                /* B[j..j+7][k], each element broadcast to a full vector */
                neon_b = vld1q_u16(B + k_T + j);
                neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0));
                neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1));
                neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2));
                neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3));
                neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4));
                neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5));
                neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6));
                neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7));

                /* C[i..i+7][k] += A[i..i+7][j+n] * B[j+n][k] for n = 0..7 */
                neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c);
                neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c);
            }

            /* One contiguous 8-lane store of the finished strip; replaces
               eight single-lane stores that were issued on every pass of the
               accumulation loop. */
            vst1q_u16(C + k_Row + i, neon_c);
        }
    }
}
/* Returns the sum of the eight 16-bit lanes of d as an int.
   The additions are carried out in int, so the total is not truncated to
   16 bits.  (The name suggests the lanes hold per-lane bit counts --
   TODO confirm against callers.) */
static inline int setbits(uint16x8_t d)
{
    int total = vgetq_lane_u16(d, 0);

    total += vgetq_lane_u16(d, 1);
    total += vgetq_lane_u16(d, 2);
    total += vgetq_lane_u16(d, 3);
    total += vgetq_lane_u16(d, 4);
    total += vgetq_lane_u16(d, 5);
    total += vgetq_lane_u16(d, 6);
    total += vgetq_lane_u16(d, 7);

    return total;
}
// Returns lane 7 of a. The FileCheck directives below pin the expected IR:
// two bitcasts of the argument followed by an extractelement at index 7.
// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
// CHECK: ret i16 [[VGETQ_LANE]]
uint16_t test_vgetq_lane_u16(uint16x8_t a) {
  const uint16_t extracted = vgetq_lane_u16(a, 7);
  return extracted;
}
/* Negative test: lane index -1 is one below the valid range 0 - 7 of a
   uint16x8_t, so the call is expected to be rejected with the diagnostic
   named in the directive inside the body.  */
uint16_t
test_vgetq_lane_u16_before (uint16x8_t in)
{
  /* { dg-error "lane -1 out of range 0 - 7" "" {target *-*-*} 0 } */
  const uint16_t below_range = vgetq_lane_u16 (in, -1);
  return below_range;
}
/* Negative test: lane index 8 is one past the valid range 0 - 7 of a
   uint16x8_t, so the call is expected to be rejected with the diagnostic
   named in the directive inside the body.  */
uint16_t
test_vgetq_lane_u16_beyond (uint16x8_t in)
{
  /* { dg-error "lane 8 out of range 0 - 7" "" {target *-*-*} 0 } */
  const uint16_t above_range = vgetq_lane_u16 (in, 8);
  return above_range;
}