/**
 * Transposes, in place, the 4x4 float matrix whose rows are the four
 * arguments: after the call, lane k of io_dataN holds what used to be
 * lane N of io_datak.
 */
void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2, __m128 &io_data3)
{
    // Step 1: interleave rows pairwise.
    //   lo01 = r0[0] r1[0] r0[1] r1[1]      hi01 = r0[2] r1[2] r0[3] r1[3]
    //   lo23 = r2[0] r3[0] r2[1] r3[1]      hi23 = r2[2] r3[2] r2[3] r3[3]
    const __m128 lo01 = _mm_unpacklo_ps(io_data0, io_data1);
    const __m128 lo23 = _mm_unpacklo_ps(io_data2, io_data3);
    const __m128 hi01 = _mm_unpackhi_ps(io_data0, io_data1);
    const __m128 hi23 = _mm_unpackhi_ps(io_data2, io_data3);

    // Step 2: glue matching 64-bit halves together to form the columns.
    // movelh(a, b) = a.lo64 | b.lo64, movehl(b, a) = a.hi64 | b.hi64.
    io_data0 = _mm_movelh_ps(lo01, lo23);
    io_data1 = _mm_movehl_ps(lo23, lo01);
    io_data2 = _mm_movelh_ps(hi01, hi23);
    io_data3 = _mm_movehl_ps(hi23, hi01);
}
double bst_compute_123_m128_unaligned8_maskstore( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m128d v_tmp; __m128d v00, v01, v02, v03; __m128d v10, v11, v12, v13; __m128d v20, v21, v22, v23; __m128d v30, v31, v32, v33; __m128i v_cur_roots; __m128 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx2, idx3; idx1 = IDX(n,n); e[idx1] = q[n]; idx1++; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx3 = idx1; for (r = i; r < n; ++r) { // idx2 = IDX(r+1, r+1); idx1 = idx3; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&7); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1] = r; } idx1++; } v_tmp = _mm_set_pd( e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm_set_epi32(r, r, r, r); for( ; idx2 < l_end; idx2 += 8 ) { v01 = _mm_loadu_pd( &w[idx1 ] ); v11 = _mm_loadu_pd( &w[idx1+2] ); v21 = _mm_loadu_pd( &w[idx1+4] ); v31 = _mm_loadu_pd( &w[idx1+6] ); v00 = _mm_loadu_pd( &e[idx2 ] ); v01 = _mm_add_pd( v01, v_tmp ); v10 = _mm_loadu_pd( &e[idx2+2] ); v11 = _mm_add_pd( v11, v_tmp ); v20 = _mm_loadu_pd( &e[idx2+4] ); v21 = _mm_add_pd( v21, v_tmp ); v30 = _mm_loadu_pd( &e[idx2+6] ); v31 = _mm_add_pd( v31, v_tmp ); v01 = _mm_add_pd( v01, v00 ); v03 = _mm_loadu_pd( &e[idx1 ] ); v11 = _mm_add_pd( v11, v10 ); v13 = _mm_loadu_pd( &e[idx1+2] ); v21 = _mm_add_pd( v21, v20 ); v23 = _mm_loadu_pd( &e[idx1+4] ); v31 = _mm_add_pd( v31, 
v30 ); v33 = _mm_loadu_pd( &e[idx1+6] ); v02 = _mm_cmplt_pd( v01, v03 ); v12 = _mm_cmplt_pd( v11, v13 ); v22 = _mm_cmplt_pd( v21, v23 ); v32 = _mm_cmplt_pd( v31, v33 ); _mm_maskstore_pd( &e[idx1 ], _mm_castpd_si128( v02 ), v01 ); _mm_maskstore_pd( &e[idx1+2], _mm_castpd_si128( v12 ), v11 ); _mm_maskstore_pd( &e[idx1+4], _mm_castpd_si128( v22 ), v21 ); _mm_maskstore_pd( &e[idx1+6], _mm_castpd_si128( v32 ), v31 ); v_rootmask0 = _mm_shuffle_ps( _mm_castpd_ps( v02 ), _mm_castpd_ps( v12 ), _MM_SHUFFLE(0,2,0,2) ); v_rootmask1 = _mm_shuffle_ps( _mm_castpd_ps( v12 ), _mm_castpd_ps( v22 ), _MM_SHUFFLE(0,2,0,2) ); _mm_maskstore_ps( &root[idx1], _mm_castps_si128( v_rootmask0 ), _mm_castsi128_ps( v_cur_roots ) ); _mm_maskstore_ps( &root[idx1+4], _mm_castps_si128( v_rootmask1 ), _mm_castsi128_ps( v_cur_roots ) ); idx1 += 8; } idx3++; } } return e[IDX(0,n)]; }
/*
 * Evaluates the homography p->H against the p->N point correspondences in
 * p->src / p->dst with a sequential probability ratio test (SPRT).
 *
 * Reads:   p->delta, p->epsilon, p->maxD, p->src, p->dst, p->H, p->N,
 *          p->lambdaTBL, p->A.
 * Writes:  p->inl       number of correspondences whose squared
 *                       reprojection error is <= maxD^2 among those tested,
 *          p->good      0 as soon as the running likelihood ratio exceeds
 *                       p->A (model rejected), 1 otherwise,
 *          p->N_tested  number of correspondences consumed before stopping.
 *
 * src and dst are interleaved (x, y) float pairs. H is read at indices
 * 0,1,2 / 4,5,6 / 8,9,10, i.e. as a 3x3 matrix with a row stride of 4 floats.
 *
 * The vector loop uses _mm_load_ps on src + 2*i and dst + 2*i with i a
 * multiple of 4, so both arrays must be 16-byte aligned. It also uses the
 * approximate reciprocal _mm_rcp_ps, whereas the scalar tail divides exactly.
 *
 * Fix relative to the previous revision: the vector loop was bounded by
 * i < (p->N - 3), which wraps around in unsigned arithmetic when N < 3 and
 * lets the loop read past the end of the arrays. The bound is now
 * i + 4 <= p->N, which is identical for N >= 3 and simply skips the vector
 * loop otherwise. The popcount table is also hoisted out of the loop.
 */
static inline void sacEvaluateModelSPRT(PROSAC_HEST* p){
	/* Number of set bits for every 4-bit mask:
	                                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
	static const unsigned bitCnt[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (   p->epsilon   ));
	float    distSq       = p->maxD*p->maxD;
	float*   src          = (float*)p->src;
	float*   dst          = (float*)p->dst;
	float*   H            = p->H;

	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;

	/* VECTOR */
	const __m128 distSqV = _mm_set1_ps(distSq);

	const __m128 H00 = _mm_set1_ps(H[0]);
	const __m128 H01 = _mm_set1_ps(H[1]);
	const __m128 H02 = _mm_set1_ps(H[2]);
	const __m128 H10 = _mm_set1_ps(H[4]);
	const __m128 H11 = _mm_set1_ps(H[5]);
	const __m128 H12 = _mm_set1_ps(H[6]);
	const __m128 H20 = _mm_set1_ps(H[8]);
	const __m128 H21 = _mm_set1_ps(H[9]);
	const __m128 H22 = _mm_set1_ps(H[10]);

	/* Four correspondences per iteration; i + 4 <= N cannot underflow. */
	for(i=0; (i+4 <= p->N) && p->good; i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;

		/* Lane 0 first in the listings below. */
		x = _mm_load_ps(src+2*i);   /* x0 y0 x1 y1 */
		y = _mm_load_ps(src+2*i+4); /* x2 y2 x3 y3 */
		X = _mm_load_ps(dst+2*i);   /* X0 Y0 X1 Y1 */
		Y = _mm_load_ps(dst+2*i+4); /* X2 Y2 X3 Y3 */

		inter0 = _mm_unpacklo_ps(x,y); /* x0 x2 y0 y2 */
		inter1 = _mm_unpackhi_ps(x,y); /* x1 x3 y1 y3 */
		inter2 = _mm_unpacklo_ps(X,Y); /* X0 X2 Y0 Y2 */
		inter3 = _mm_unpackhi_ps(X,Y); /* X1 X3 Y1 Y3 */

		/* De-interleaved coordinates. The four points end up in lane order
		 * 0,2,1,3 -- the same permutation for x, y, X and Y, so every lane
		 * still holds one consistent correspondence. */
		x = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));

		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);

		/* Homogeneous -> Euclidean, using the fast approximate reciprocal. */
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);

		/* Squared distance to the measured destination point. */
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);

		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);

		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);

		/* One mask bit per lane; bit order follows the 0,2,1,3 lane order. */
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int    msk = _mm_movemask_ps(cmp);

		p->inl += bitCnt[msk];

		/* SPRT */
		/* NOTE(review): lambdaTBL is indexed by the permuted mask; this is
		 * only order-independent if the table depends on the popcount of
		 * the mask alone -- verify where the table is built. */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}

	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];

		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)

		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;

		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;

		isInlier = reprojDist <= distSq;
		p->inl  += isInlier;

		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}

	p->N_tested = i;
}
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, l_end_pre, j; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m128d v_tmp; __m128d v00, v01, v02, v03; __m128d v10, v11, v12, v13; __m128i v_cur_roots, v_old_roots, v_new_roots; __m128 v_rootmask; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx2, idx3, pad, pad_r; idx1 = (n+1)*(n+2)/2 + n/2; e[idx1] = q[n]; idx1++; pad = 1; // pad contains the padding for row i+1 // for row n it's always 1 for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } // idx2 now points to the beginning of the next line. idx2 += pad; // padding of line i+1 idx3 = idx1; pad_r = pad; // padding of line r for (r = i; r < n; ++r) { pad_r = !pad_r; // padding of line r+1 // idx2 = IDX(r+1, r+1); idx1 = idx3; l_end = idx2 + (n-r); e_tmp = e[idx1++]; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&3); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1] = r; } idx1++; } v_tmp = _mm_set_pd( e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm_set_epi32(r, r, r, r); for( ; idx2 < l_end; idx2 += 4 ) { v01 = _mm_load_pd( &w[idx1 ] ); v11 = _mm_load_pd( &w[idx1+2] ); v00 = _mm_load_pd( &e[idx2 ] ); v01 = _mm_add_pd( v01, v_tmp ); // supoptimal for raw-dependency v10 = _mm_load_pd( &e[idx2+2] ); v11 = _mm_add_pd( v11, v_tmp ); v01 = _mm_add_pd( v01, v00 ); v03 = _mm_load_pd( &e[idx1 ] ); v11 = _mm_add_pd( v11, v10 ); v13 = _mm_load_pd( &e[idx1+2] ); v02 = _mm_cmplt_pd( v01, v03 ); v12 = _mm_cmplt_pd( v11, v13 ); v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), 
_mm_andnot_pd( v02, v03 )); v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 )); _mm_store_pd( &e[idx1 ], v00 ); _mm_store_pd( &e[idx1+2], v10 ); v_rootmask = _mm_shuffle_ps( _mm_castpd_ps( v02 ), _mm_castpd_ps( v12 ), _MM_SHUFFLE(0,2,0,2) ); v_old_roots = _mm_lddqu_si128( &root[idx1] ); v_new_roots = _mm_or_si128( _mm_and_si128( v_cur_roots, _mm_castps_si128( v_rootmask ) ), _mm_andnot_si128( v_old_roots, _mm_castps_si128( v_rootmask ) ) ); _mm_storeu_si128( &root[idx1], v_new_roots ); idx1 += 4; } idx2 += pad_r; idx3++; } pad = !pad; // every other line as padding 0, or 1, respectively } // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[n + !(n&1)]; }