int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                    intptr_t block_size, int64_t *ssz, int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bits
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
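The SSE2 path above only fires when every coefficient fits in 15 bits, because _mm_packs_epi32 would saturate wider values and _mm_madd_epi16 multiplies 16-bit lanes. A scalar reference of the same computation is handy for checking the vector path; the sketch below is mine (tran_low_t widened to int32_t), not the libaom fallback itself.

/* Scalar reference sketch for the block-error computation above. */
#include <stdint.h>

static int64_t highbd_block_error_ref(const int32_t *coeff, const int32_t *dqcoeff,
                                      intptr_t block_size, int64_t *ssz, int bps) {
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
  int64_t error = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; i++) {
    const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
    error += diff * diff;                       // squared quantization error
    sqcoeff += (int64_t)coeff[i] * coeff[i];    // squared source coefficient
  }
  *ssz = (sqcoeff + rounding) >> shift;
  return (error + rounding) >> shift;
}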
__m128i test_mm_cmplt_epi32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmplt_epi32
  // DAG: icmp sgt <4 x i32>
  //
  // ASM-LABEL: test_mm_cmplt_epi32
  // ASM: pcmpgtd
  return _mm_cmplt_epi32(A, B);
}
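The FileCheck lines above show that clang lowers _mm_cmplt_epi32 to a signed greater-than (icmp sgt / pcmpgtd) with the operands swapped, since SSE2 has no dedicated less-than instruction. A minimal sketch of that equivalence, lane by lane:

/* Sketch: _mm_cmplt_epi32(a, b) produces the same mask as _mm_cmpgt_epi32(b, a);
 * each lane is 0 or 0xFFFFFFFF. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set_epi32(3, -1, 7, 0);
  __m128i b = _mm_set_epi32(4, -2, 7, 1);
  __m128i lt = _mm_cmplt_epi32(a, b);
  __m128i gt = _mm_cmpgt_epi32(b, a);  // same result, operands swapped
  printf("%04x %04x\n", _mm_movemask_epi8(lt), _mm_movemask_epi8(gt));  // f00f f00f
  return 0;
}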
SIMDValue SIMDInt32x4Operation::OpLessThan(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_cmplt_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static inline __m128i clamp_signed_byte_SSE2(const __m128i& n) {
    __m128i cmp1 = _mm_cmplt_epi32(n, _mm_setzero_si128());
    __m128i cmp2 = _mm_cmpgt_epi32(n, _mm_set1_epi32(255));
    __m128i ret = _mm_and_si128(cmp2, _mm_set1_epi32(255));

    __m128i cmp = _mm_or_si128(cmp1, cmp2);
    ret = _mm_or_si128(_mm_and_si128(cmp, ret), _mm_andnot_si128(cmp, n));

    return ret;
}
__m128i branchfree_search4_avx(int* source, size_t n, __m128i target) {
    __m128i offsets = _mm_setzero_si128();
    if (n == 0) return offsets;

    __m128i ha = _mm_set1_epi32(n >> 1);
    while (n > 1) {
        n -= n >> 1;
        __m128i offsetsplushalf = _mm_add_epi32(offsets, ha);
        ha = _mm_sub_epi32(ha, _mm_srli_epi32(ha, 1));
        __m128i keys = _mm_i32gather_epi32(source, offsetsplushalf, 4);
        __m128i lt = _mm_cmplt_epi32(keys, target);
        offsets = _mm_blendv_epi8(offsets, offsetsplushalf, lt);
    }
    __m128i lastkeys = _mm_i32gather_epi32(source, offsets, 4);
    __m128i lastlt = _mm_cmplt_epi32(lastkeys, target);
    __m128i oneswhereneeded = _mm_srli_epi32(lastlt, 31);
    __m128i answer = _mm_add_epi32(offsets, oneswhereneeded);
    return answer;
}
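The routine above is a branch-free lower-bound search run on four keys at once (the gathers require AVX2, so build with -mavx2). A usage sketch, assuming the function above is in the same translation unit: each output lane is the index of the first element not less than the corresponding target.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  int sorted[8] = { 1, 3, 3, 5, 8, 13, 21, 34 };
  __m128i targets = _mm_set_epi32(40, 13, 4, 0);   // lanes 0..3 = 0, 4, 13, 40
  __m128i idx = branchfree_search4_avx(sorted, 8, targets);
  int out[4];
  _mm_storeu_si128((__m128i *)out, idx);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // expect 0 3 5 8
  return 0;
}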
static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                       const __m128i& sa, const __m128i& da) {
    __m128i sd = _mm_mullo_epi16(sc, da);
    __m128i ds = _mm_mullo_epi16(dc, sa);
    __m128i cmp = _mm_cmplt_epi32(sd, ds);
    __m128i tmp = _mm_add_epi32(sc, dc);
    __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
                               _mm_andnot_si128(cmp, ret2));
    return ret;
}
/*
int32_t search_range(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) {
    int32_t ret = 0;
    for (int i = 0; i < n; i++) {
        if (rect.lx <= x[i] && x[i] <= rect.rx && rect.ly <= y[i] && y[i] <= rect.ry) {
            ret += w[i];
        }
    }
    return ret;
}
*/

int32_t search_range(Rect rect, int32_t x[], int32_t y[], int32_t w[], int32_t n) {
    __m128i ret = _mm_set_epi32(0, 0, 0, 0);
    // Widen the rectangle by one so the inclusive bounds can be tested with strict compares.
    rect.lx--, rect.ly--;
    rect.rx++, rect.ry++;
    __m128i lx = _mm_broadcastd_epi32(*((__m128i *) &rect.lx));
    __m128i ly = _mm_broadcastd_epi32(*((__m128i *) &rect.ly));
    __m128i rx = _mm_broadcastd_epi32(*((__m128i *) &rect.rx));
    __m128i ry = _mm_broadcastd_epi32(*((__m128i *) &rect.ry));
    __m128i zo = _mm_set_epi32(0, 0, 0, 0);
    __m128i ic = _mm_set_epi32(3, 2, 1, 0);
    for (int i = 0; i + 4 <= n; i += 4) {
        __m128i sx = _mm_load_si128((__m128i *) (x + i));
        __m128i sy = _mm_load_si128((__m128i *) (y + i));
        __m128i c1 = _mm_and_si128(_mm_cmplt_epi32(lx, sx), _mm_cmplt_epi32(sx, rx));
        __m128i c2 = _mm_and_si128(_mm_cmplt_epi32(ly, sy), _mm_cmplt_epi32(sy, ry));
        if (_mm_testz_si128(c1, c2) == 0) {
            __m128i cc = _mm_and_si128(c1, c2);
            __m128i vi = _mm_add_epi32(ic, _mm_set_epi32(i, i, i, i));
            __m128i rs = _mm_mask_i32gather_epi32(zo, w + i, ic, cc, 4);
            ret = _mm_add_epi32(ret, rs);
        }
    }
    int32_t sum = 0;
    // Scalar tail: rect was widened above, so use strict compares here to match the reference.
    for (int i = (n >> 2) << 2; i < n; i++) {
        if (rect.lx < x[i] && x[i] < rect.rx && rect.ly < y[i] && y[i] < rect.ry) {
            sum += w[i];
        }
    }
    static int32_t tmp[4] __attribute__ ((aligned (16)));
    _mm_store_si128((__m128i*) &tmp[0], ret);
    sum += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    return sum;
}
SIMDValue SIMDUint32x4Operation::OpLessThan(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
    X86SIMDValue signBits;
    signBits.m128i_value = _mm_set1_epi32(0x80000000);

    // Signed comparison of unsigned ints can be done if both operands have the "sign" bit flipped (xored with 0x80000000)
    tmpaValue.m128i_value = _mm_xor_si128(tmpaValue.m128i_value, signBits.m128i_value);
    tmpbValue.m128i_value = _mm_xor_si128(tmpbValue.m128i_value, signBits.m128i_value);
    x86Result.m128i_value = _mm_cmplt_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
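The sign-bit flip generalizes to a tiny stand-alone helper. A sketch outside the Chakra wrappers; the name cmplt_epu32 is mine, not an SSE intrinsic:

/* Unsigned 32-bit less-than on SSE2: flip the sign bit of both operands,
 * then the signed compare gives the unsigned ordering. */
#include <emmintrin.h>

static inline __m128i cmplt_epu32(__m128i a, __m128i b) {
  const __m128i sign = _mm_set1_epi32((int)0x80000000);
  return _mm_cmplt_epi32(_mm_xor_si128(a, sign), _mm_xor_si128(b, sign));
}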
SIMDValue SIMDFloat32x4Operation::OpFromUint32x4(const SIMDValue& value)
{
    X86SIMDValue x86Result, temp1;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    // find unsigned values above 2^31-1. Comparison is signed, so look for values < 0
    temp1.m128i_value = _mm_cmplt_epi32(v.m128i_value, X86_ALL_ZEROS.m128i_value);
    // temp1 has f32(2^32) for unsigned values above 2^31, 0 otherwise
    temp1.m128_value = _mm_and_ps(temp1.m128_value, X86_TWO_32_F4.m128_value);
    // convert
    x86Result.m128_value = _mm_cvtepi32_ps(v.m128i_value);
    // Add f32(2^32) to negative values
    x86Result.m128_value = _mm_add_ps(x86Result.m128_value, temp1.m128_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
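The same uint32-to-float trick without the Chakra helper types, as a sketch (helper name is mine): lanes with the top bit set read as negative after _mm_cvtepi32_ps, so 2^32 is added back to exactly those lanes.

#include <emmintrin.h>

static inline __m128 cvtepu32_ps_sse2(__m128i v) {
  __m128i is_big = _mm_cmplt_epi32(v, _mm_setzero_si128());   // lanes >= 2^31
  __m128 correction = _mm_and_ps(_mm_castsi128_ps(is_big),
                                 _mm_set1_ps(4294967296.0f)); // f32(2^32) where needed
  return _mm_add_ps(_mm_cvtepi32_ps(v), correction);
}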
static inline __m128i clamp_div255round_SSE2(const __m128i& prod) {
    // test if > 0
    __m128i cmp1 = _mm_cmpgt_epi32(prod, _mm_setzero_si128());
    // test if < 255*255
    __m128i cmp2 = _mm_cmplt_epi32(prod, _mm_set1_epi32(255*255));

    __m128i ret = _mm_setzero_si128();

    // if value >= 255*255, value = 255
    ret = _mm_andnot_si128(cmp2, _mm_set1_epi32(255));

    __m128i div = SkDiv255Round_SSE2(prod);

    // test if > 0 && < 255*255
    __m128i cmp = _mm_and_si128(cmp1, cmp2);

    ret = _mm_or_si128(_mm_and_si128(cmp, div), _mm_andnot_si128(cmp, ret));

    return ret;
}
RETi CMPLT(const __m128i x, const __m128i y) { return _mm_cmplt_epi32(x, y); }
void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block, const int16_t *zbin_ptr,
                                const int16_t *round_ptr, const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                                tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                                uint16_t *eob_ptr, const int16_t *scan,
                                const int16_t *iscan) {
  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
  __m128i zbins[2];
  __m128i nzbins[2];

  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
                           (int)zbin_ptr[0]);
  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);

  nzbins[0] = _mm_setzero_si128();
  nzbins[1] = _mm_setzero_si128();
  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);

  (void)scan;

  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));

  if (!skip_block) {
    // Pre-scan pass
    for (i = ((int)count / 4) - 1; i >= 0; i--) {
      __m128i coeffs, cmp1, cmp2;
      int test;
      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
      cmp1 = _mm_and_si128(cmp1, cmp2);
      test = _mm_movemask_epi8(cmp1);
      if (test == 0xffff)
        non_zero_regs--;
      else
        break;
    }

    // Quantization pass:
    for (i = 0; i < non_zero_regs; i++) {
      __m128i coeffs, coeffs_sign, tmp1, tmp2;
      int test;
      int abs_coeff[4];
      int coeff_sign[4];

      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
      coeffs_sign = _mm_srai_epi32(coeffs, 31);
      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
      tmp1 = _mm_or_si128(tmp1, tmp2);
      test = _mm_movemask_epi8(tmp1);
      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);

      for (j = 0; j < 4; j++) {
        if (test & (1 << (4 * j))) {
          int k = 4 * i + j;
          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
          const uint32_t abs_qcoeff =
              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
        }
      }
    }
  }
  // Report the end-of-block position (one past the last nonzero coefficient).
  *eob_ptr = eob_i + 1;
}
int sse_auction_search(int *pr, int *P, int *ai0, int *ai1, int *a0, int *a1, int nodes, int arcs, int s, int t) {
    int i __attribute__ ((aligned (16))) = 0;
    int j __attribute__ ((aligned (16))) = t;
    int k __attribute__ ((aligned (16))) = 0;
    int m __attribute__ ((aligned (16))) = 0;
    int maxla __attribute__ ((aligned (32))) = 0;
    int argmaxla __attribute__ ((aligned (16))) = 0;
    int cost __attribute__ ((aligned (16))) = 0;
    int length __attribute__ ((aligned (16))) = 1;
    int path_cost __attribute__ ((aligned (16))) = 0;
    uint32_t tmp1, tmp2;
    int cost_tab[nodes+1];
    __m128i a0sse, a1sse, ai0sse, ai1sse, ai1sse1, I, J, K, M, then;
    __m128i ARCS, MNODES, INFINITE, NEGINF, prsse, Psse, MAXLA, ARGMAXLA, LA, mask1, mask2, mask3, COST;

    for(i = 0; i <= nodes; i++) {
        cost_tab[i] = 0;
    }
    if(check_s_t(s, t, P, nodes) != 0) {
        return 1;
    }

    while(P[s] == INF) {
        k = -1;
        m = -1;
        //printf("j = %d\n", j);
        J = _mm_set1_epi32(j);             // current value of j
        K = _mm_set1_epi32(-1);            // starting index in the edge-cost array
        M = _mm_set1_epi32(-1);            // ending index in the edge-cost array
        MNODES = _mm_set1_epi32(nodes-1);  // number of nodes minus 1 (to detect the end of the array)
        ARCS = _mm_set1_epi32(arcs);       // number of edges

        /* compute k, m */
        for(i = 0; i < nodes; i+=4) {
            ai0sse = _mm_load_si128((__m128i*) &ai0[i]);  // load ai0 (node numbers)
            ai1sse = _mm_load_si128((__m128i*) &ai1[i]);  // load ai1 (indices into the edge array)
            ai1sse1 = _mm_set_epi32(ai1[i+4],ai1[i+3],ai1[i+2],ai1[i+1]);  // load the ai1 indices shifted by 1
            mask1 = _mm_cmpeq_epi32(J, ai0sse);           // check the condition j == ai0[i]
            K = _mm_or_si128(_mm_and_si128(mask1,ai1sse), _mm_andnot_si128(mask1,K));  // determine K
            I = _mm_set_epi32(i+3, i+2, i+1, i);          // current values of i
            mask2 = _mm_cmplt_epi32(I, MNODES);           // check the condition i < nodes-1
            mask3 = _mm_and_si128(mask1,mask2);           // combine conditions 1 and 2
            then = _mm_or_si128(_mm_and_si128(mask2,ai1sse1), _mm_andnot_si128(mask2,ARCS));  // m = ai1[i+1] or arcs
            M = _mm_or_si128(_mm_and_si128(mask3,then), _mm_andnot_si128(mask3,M));  // determine M
        }
        for(i = 0; i < nodes; i++) {
            if(ai0[i] == j) {
                k = ai1[i];  // k - starting index of the edges leaving j
                //printf("i = %d ", i);
                if(i < nodes - 1) {
                    m = ai1[i+1];
                } else {
                    m = arcs;
                }
            }
        }
        /* extract k, m */
        for(i = 0; i < 4; i++) {
            tmp1 = get_from_m128i(K,i);
            tmp2 = get_from_m128i(M,i);
            if(tmp1 != -1) {
                k = tmp1;
            }
            if(tmp2 != -1) {
                m = tmp2;
            }
        }
        //printf("K,M: %d %d\n", k, m);

        /* choose the optimal edge */
        if(k != -1) {
            INFINITE = _mm_set1_epi32(INF);  // the "infinite" value
            NEGINF = _mm_set1_epi32(0-INF);  // the value -INF
            COST = _mm_set1_epi32(cost);     // cost of the chosen edge
            MAXLA = _mm_set1_epi32(0-INF);   // maximum value of la = pr[a0[i]] - a1[i]
            ARGMAXLA = _mm_set1_epi32(-1);   // index for which la is largest
            for(i = k; i < m; i+=4) {
                a1sse = _mm_set_epi32(a1[i],a1[i+1],a1[i+2],a1[i+3]);  // load a1
                a0sse = _mm_set_epi32(a0[i],a0[i+1],a0[i+2],a0[i+3]);  // load a0
                prsse = _mm_set_epi32(pr[a0[i]],pr[a0[i+1]],pr[a0[i+2]],pr[a0[i+3]]);  // load pr
                Psse = _mm_set_epi32(P[a0[i]],P[a0[i+1]],P[a0[i+2]],P[a0[i+3]]);       // load P
                mask1 = _mm_cmpgt_epi32(_mm_set1_epi32(m),_mm_set_epi32(i,i+1,i+2,i+3));  // is this the last pass
                prsse = _mm_or_si128(_mm_and_si128(mask1,prsse), _mm_andnot_si128(mask1,NEGINF));  // mask out arcs past the range
                LA = _mm_sub_epi32(prsse, a1sse);        // la = pr[a0[i]] - a1[i]
                then = _mm_max_epi32(LA,MAXLA);          // maximum of la and maxla
                mask1 = _mm_cmpeq_epi32(Psse,INFINITE);  // is P[i] == INF
                mask2 = _mm_and_si128(mask1,_mm_cmpgt_epi32(LA,MAXLA));  // is P[i] == INF and LA > MAXLA
                MAXLA = _mm_or_si128(_mm_and_si128(mask1,then), _mm_andnot_si128(mask1,MAXLA));          // update maxla
                ARGMAXLA = _mm_or_si128(_mm_and_si128(mask2,a0sse), _mm_andnot_si128(mask2,ARGMAXLA));  // update argmaxla
                COST = _mm_or_si128(_mm_and_si128(mask2,a1sse), _mm_andnot_si128(mask2,COST));          // update cost
            }
        }
        /* extract maxla, argmaxla, cost */
        maxla = 0 - INF;
        for(i = 0; i < 4; i++) {
            tmp1 = get_from_m128i(MAXLA,i);
            if(tmp1 > maxla) {
                argmaxla = get_from_m128i(ARGMAXLA,i);
                maxla = tmp1;
                cost = get_from_m128i(COST,i);
            }
        }
        //printf("COST: %d, PATH_COST: %d\n", cost, path_cost);
        //printf("pr[j] = %d, maxla = %d, argmaxla = %d\n", pr[j], maxla, argmaxla);

        /* shorten the path */
        if(pr[j] > maxla || maxla == -INF) {
            /* update the price */
            pr[j] = maxla;
            /* a single-element path is not shortened */
            if(j != t) {
                /* update the path */
                P[j] = INF;
                length = length - 1;
                path_cost = path_cost - cost_tab[length];
                cost_tab[length] = 0;
                /* go back to the previous vertex on the path (j); k is the one being cut off */
                k = j;
                for(i = 0; i < nodes; i++) {
                    if(P[i] == length - 1) {
                        j = i;
                        break;
                    }
                }
            }
        }
        /* extend the path */
        else {
            P[argmaxla] = length;
            j = argmaxla;
            path_cost = path_cost + cost;
            cost_tab[length] = cost;
            length = length + 1;
            /* the path has reached the start vertex => done */
            if(argmaxla == s) {
                printf("dlugosc sciezki: %d\n", path_cost);
                return 0;
            }
        }
    }
    return 0;
}
static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
    __m128i cmp = _mm_cmplt_epi32(a, b);
    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
}
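The and/andnot pair above is the standard pre-SSE4.1 select: (mask & a) | (~mask & b) picks a where the mask lanes are all-ones and b elsewhere. A sketch of the same idiom factored into a generic helper, with the max counterpart; the helper names are mine:

#include <emmintrin.h>

static inline __m128i select_epi32_sse2(__m128i mask, __m128i a, __m128i b) {
  // mask lanes must be 0 or 0xFFFFFFFF, as produced by the compare intrinsics
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}

static inline __m128i max32_sse2(__m128i a, __m128i b) {
  return select_epi32_sse2(_mm_cmpgt_epi32(a, b), a, b);  // counterpart to SkMin32_SSE2
}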
static inline int sacIsSampleDegenerate(PROSAC_HEST* p){ unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3]; /** * Pack the matches selected by the SAC algorithm. * Must be packed points[0:7] = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3} * points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3} * Gather 4 points into the vector */ __m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]); src10 = _mm_loadh_pi(src10, (__m64*)&p->src[i1]); __m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]); src32 = _mm_loadh_pi(src32, (__m64*)&p->src[i3]); __m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]); dst10 = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]); __m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]); dst32 = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]); /** * If the matches' source points have common x and y coordinates, abort. */ /** * Check: * packedPoints[0].x == packedPoints[2].x * packedPoints[0].y == packedPoints[2].y * packedPoints[1].x == packedPoints[3].x * packedPoints[1].y == packedPoints[3].y */ __m128 chkEq0 = _mm_cmpeq_ps(src10, src32); /** * Check: * packedPoints[1].x == packedPoints[2].x * packedPoints[1].y == packedPoints[2].y * packedPoints[0].x == packedPoints[3].x * packedPoints[0].y == packedPoints[3].y */ __m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32); /** * Check: * packedPoints[0].x == packedPoints[1].x * packedPoints[0].y == packedPoints[1].y * packedPoints[2].x == packedPoints[3].x * packedPoints[2].y == packedPoints[3].y */ __m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)), _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2))); /* Verify */ if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){ return 1; } /* If the matches do not satisfy the strong geometric constraint, abort. 
*/ /** * p6420x = (p6.x, p4.x, p2.x, p0.x) * p6420y = (p6.y, p4.y, p2.y, p0.y) * p7531x = (p7.x, p5.x, p3.x, p1.x) * p7531y = (p7.y, p5.y, p3.y, p1.y) * crosssd0 = p6420y - p7531y = (cross2d0, cross0d0, cross2s0, cross0s0) * crosssd1 = p7531x - p6420x = (cross2d1, cross0d1, cross2s1, cross0s1) * crosssd2 = p6420x * p7531y - p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2) * * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0) * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1) * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2) * * dotsd0 = shufcrosssd0 * p6420x + * shufcrosssd1 * p6420y + * shufcrosssd2 * = (dotd0, dotd2, dots0, dots2) * dotsd1 = shufcrosssd0 * p7531x + * shufcrosssd1 * p7531y + * shufcrosssd2 * = (dotd1, dotd3, dots1, dots3) * * dots = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0)) * dotd = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2)) * movmaskps(dots ^ dotd) */ __m128 p3210x = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p3210y = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p7654x = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p7654y = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p6420x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p6420y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p7531x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p7531y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1)); __m128 crosssd0 = _mm_sub_ps(p6420y, p7531y); __m128 crosssd1 = _mm_sub_ps(p7531x, p6420x); __m128 crosssd2 = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x)); __m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1)); __m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1)); __m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1)); __m128 dotsd0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x), _mm_mul_ps(shufcrosssd1, p6420y)), shufcrosssd2); __m128 dotsd1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x), _mm_mul_ps(shufcrosssd1, p7531y)), shufcrosssd2); __m128 dots = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1)); __m128 dotd = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3)); //if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){ if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){ return 1; } /* Otherwise, proceed with evaluation */ _mm_store_ps((float*)&p->pkdPts[0], src10); _mm_store_ps((float*)&p->pkdPts[2], src32); _mm_store_ps((float*)&p->pkdPts[4], dst10); _mm_store_ps((float*)&p->pkdPts[6], dst32); return 0; }
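The final test above relies on the fact that two integers have opposite signs exactly when the XOR of their bit patterns has the sign bit set; note the snippet applies it to _mm_cvtps_epi32 results, so float values that round to zero count as non-negative. A minimal sketch of that sign test on its own (helper name is mine):

#include <emmintrin.h>

static inline int any_lane_signs_differ(__m128i a, __m128i b) {
  __m128i opposite = _mm_cmplt_epi32(_mm_xor_si128(a, b), _mm_setzero_si128());
  return _mm_movemask_epi8(opposite) != 0;  // nonzero if any lane pair has opposite signs
}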
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t *p0 = (uint16_t *)buff + 8; uint16_t *p1 = p0 + bstride; uint16_t *p2 = p1 + bstride; uint16_t *p3 = p2 + bstride; uint16_t *p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128i all1 = _mm_cmpeq_epi32(zero, zero); __m128i one = _mm_srli_epi32(all1, 31); __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h); __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix_h[5]; __m128i matrix_v[5]; int sign_h[5]; int sign_v[5]; for (int i = 0; i < 5; i++) { sign_h[i] = ch->m_h[i] < 0 ? 1 : 0; sign_v[i] = ch->m_v[i] < 0 ? 1 : 0; uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i]; matrix_h[i] = _mm_set1_epi16((int16_t)val); val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i]; matrix_v[i] = _mm_set1_epi16((int16_t)val); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); for (int x = 0; x < width; x += 8) { uint16_t *array[] = { p0 + x, p1 + x, p2 + x, p3 + x, p4 + x, p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2 }; for (int j = 0; j < 2; j++) { __m128i *matrix = j == 0 ? matrix_v : matrix_h; int *sign = j == 0 ? sign_v : sign_h; __m128 rdiv = j == 0 ? rdiv_v : rdiv_h; __m128i sum[2]; sum[0] = _mm_setzero_si128(); sum[1] = _mm_setzero_si128(); for (int i = 0; i < 5; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]); xmm1 = _mm_mullo_epi16(xmm0, matrix[i]); xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]); xmm2 = _mm_unpacklo_epi16(xmm1, xmm0); xmm0 = _mm_unpackhi_epi16(xmm1, xmm0); if (sign[i]) { xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1)); xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1)); } sum[0] = _mm_add_epi32(sum[0], xmm2); sum[1] = _mm_add_epi32(sum[1], xmm0); } for (int i = 0; i < 2; i++) { __m128 sumfp; __m128i mask, temp; sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); if (j == 1) { sumfp = _mm_add_ps(sumfp, bias); } sum[i] = _mm_cvttps_epi32(sumfp); temp = _mm_srli_epi32(all1, 16); mask = _mm_cmplt_epi32(sum[i], temp); sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask), _mm_andnot_si128(mask, temp)); mask = _mm_cmpgt_epi32(sum[i], zero); if (ch->saturate) { sum[i] = _mm_and_si128(mask, sum[i]); } else { temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1)); sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]), _mm_andnot_si128(mask, temp)); } } sum[0] = mm_cast_epi32(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
//----------------------------------------------------------------------------------------- // Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer // If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee // as visible. If all rasterized AABB pixels are occluded then the occludee is culled //----------------------------------------------------------------------------------------- void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) // Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. // so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040 _mm_setcsr( _mm_getcsr() | 0x8040 ); __m128i colOffset = _mm_set_epi32(0, 1, 0, 1); __m128i rowOffset = _mm_set_epi32(0, 0, 1, 1); __m128i fxptZero = _mm_setzero_si128(); float* pDepthBuffer = (float*)pRenderTargetPixels; // Rasterize the AABB triangles 4 at a time for(UINT i = 0; i < AABB_TRIANGLES; i += SSE) { vFloat4 xformedPos[3]; Gather(xformedPos, i); // use fixed-point only for X and Y. Avoid work for Z and W. vFxPt4 xFormedFxPtPos[3]; for(int m = 0; m < 3; m++) { xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X); xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y); xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z); xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y); __m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y); __m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X); __m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X); __m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y)); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y)); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y)); // Compute triangle area __m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X); triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y)); triArea = _mm_add_epi32(triArea, C0); __m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea)); // Use bounding box traversal strategy to determine which pixels to rasterize __m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endX = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW)); __m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE)); __m128i endY = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, 
xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH)); for(int vv = 0; vv < 3; vv++) { // If W (holding 1/w in our case) is not between 0 and 1, // then vertex is behind near clip plane (1.0 in our case. // If W < 1, then verify 1/W > 1 (for W>0), and 1/W < 0 (for W < 0). __m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f)); __m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f)); __m128 nearClipMask = _mm_or_ps(nearClipMask0, nearClipMask1); if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask)) { // All four vertices are behind the near plane (we're processing four triangles at a time w/ SSE) *mVisible = true; return; } } // Now we have 4 triangles set up. Rasterize them each individually. for(int lane=0; lane < SSE; lane++) { // Skip triangle if area is zero if(triArea.m128i_i32[lane] <= 0) { continue; } // Extract this triangle's properties from the SIMD versions __m128 zz[3], oneOverW[3]; for(int vv = 0; vv < 3; vv++) { zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]); oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]); } __m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]); zz[0] *= oneOverTotalArea; zz[1] *= oneOverTotalArea; zz[2] *= oneOverTotalArea; int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]); __m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]); __m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]); __m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]); __m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]); __m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]); __m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]); __m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]); __m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]); __m128i aa0Inc = _mm_slli_epi32(aa0, 1); __m128i aa1Inc = _mm_slli_epi32(aa1, 1); __m128i aa2Inc = _mm_slli_epi32(aa2, 1); __m128i row, col; int rowIdx; // To avoid this branching, choose one method to traverse and store the pixel depth if(gVisualizeDepthBuffer) { // Sequentially traverse and store pixel depths contiguously rowIdx = (startYy * SCREENW + startXx); } else { // Tranverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X // This method provides better perfromance rowIdx = (startYy * SCREENW + 2 * startXx); } col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx)); __m128i aa0Col = _mm_mullo_epi32(aa0, col); __m128i aa1Col = _mm_mullo_epi32(aa1, col); __m128i aa2Col = _mm_mullo_epi32(aa2, col); row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy)); __m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0); __m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1); __m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2); __m128i bb0Inc = _mm_slli_epi32(bb0, 1); __m128i bb1Inc = _mm_slli_epi32(bb1, 1); __m128i bb2Inc = _mm_slli_epi32(bb2, 1); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for(int r = startYy; r < endYy; r += 2, row = _mm_add_epi32(row, _mm_set1_epi32(2)), rowIdx = rowIdx + 2 * SCREENW, bb0Row = _mm_add_epi32(bb0Row, bb0Inc), bb1Row = _mm_add_epi32(bb1Row, bb1Inc), bb2Row = _mm_add_epi32(bb2Row, bb2Inc)) { // Compute barycentric coordinates int idx = rowIdx; __m128i alpha = _mm_add_epi32(aa0Col, bb0Row); __m128i beta = _mm_add_epi32(aa1Col, bb1Row); __m128i gama = 
_mm_add_epi32(aa2Col, bb2Row); int idxIncr; if(gVisualizeDepthBuffer) { idxIncr = 2; } else { idxIncr = 4; } for(int c = startXx; c < endXx; c += 2, idx = idx + idxIncr, alpha = _mm_add_epi32(alpha, aa0Inc), beta = _mm_add_epi32(beta, aa1Inc), gama = _mm_add_epi32(gama, aa2Inc)) { //Test Pixel inside triangle __m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama)); // Early out if all of this quad's pixels are outside the triangle. if(_mm_test_all_zeros(mask, mask)) { continue; } // Compute barycentric-interpolated depth __m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1])); depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2])); __m128 previousDepthValue; if(gVisualizeDepthBuffer) { previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]); } else { previousDepthValue = *(__m128*)&pDepthBuffer[idx]; } __m128 depthMask = _mm_cmpge_ps( depth, previousDepthValue); __m128i finalMask = _mm_and_si128( mask, _mm_castps_si128(depthMask)); if(!_mm_test_all_zeros(finalMask, finalMask)) { *mVisible = true; return; //early exit } }//for each column }// for each row }// for each triangle }// for each set of SIMD# triangles }
int camCompareDescriptors(const int *desc1, const int *desc2, const int s) { int i, j, distance = 0; __m128i sum, d1, d2, md, d, cmp; __m128i *p1 = (__m128i*)desc1, *p2 = (__m128i*)desc2; ALIGN(int out_sse[4], 16); /* Looks like a good idea... But this deteriorates performance... // Software prefetch d1 = _mm_load_si128(p1); d2 = _mm_load_si128(p2); for (i = 0; i != s; i += 32) { _mm_prefetch(&desc1[i], _MM_HINT_NTA); _mm_prefetch(&desc2[i], _MM_HINT_NTA); } */ sum = _mm_setzero_si128(); for (i = 0; i != s >> 4; i++) { // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); } _mm_store_si128((__m128i*)out_sse, sum); return out_sse[0] + out_sse[1] + out_sse[2] + out_sse[3]; }
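Each unrolled block above computes a per-lane absolute difference with SSE2 only, by forming both a-b and b-a and keeping the non-negative one via the _mm_cmplt_epi32 mask. A sketch of that step factored into a helper (my naming; _mm_abs_epi32 would otherwise require SSSE3):

#include <emmintrin.h>

static inline __m128i absdiff_epi32_sse2(__m128i a, __m128i b) {
  __m128i d   = _mm_sub_epi32(a, b);
  __m128i md  = _mm_sub_epi32(b, a);
  __m128i neg = _mm_cmplt_epi32(d, _mm_setzero_si128());  // lanes where a - b < 0
  return _mm_or_si128(_mm_and_si128(neg, md), _mm_andnot_si128(neg, d));
}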
inline FORCE_INLINE __m128i mm_min_epi32(__m128i a, __m128i b) { __m128i mask = _mm_cmplt_epi32(a, b); return mm_blendv_ps(a, b, mask); }
/***************************************************************************** * This function utilises 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * * vp9_encoder.c. * * For the joint cost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * * For the component costs: * * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * * If these do not hold, then this function cannot be used without * * modification, in which case you can revert to using the C implementation, * * which does not rely on these properties. * *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); // search_param determines the length of the initial step and hence the number // of iterations. // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; const int tot_steps = cfg->total_steps - search_param; const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; const uint8_t *const what = x->plane[0].src.buf; const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; #if ARCH_X86_64 __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif unsigned int best_sad; int i; int j; int step; // Check the prerequisite cost function properties that are easy to check // in an assert. See the function-level documentation for details on all // prerequisites. 
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); // Check the starting position best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { __m128i v_sad_d; __m128i v_cost_d; __m128i v_outside_d; __m128i v_inside_d; __m128i v_diff_mv_w; #if ARCH_X86_64 __m128i v_blocka[2]; #else __m128i v_blocka[1]; #endif // Compute the candidate motion vectors const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]); const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); // Clamp them to the search bounds __m128i v_these_mv_clamp_w = v_these_mv_w; v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); // The ones that did not change are inside the search area v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); // If none of them are inside, then move on if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) { continue; } // The inverse mask indicates which of the MVs are outside v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); // Shift right to keep the sign bit clear, we will use this later // to set the cost to the maximum value. v_outside_d = _mm_srli_epi32(v_outside_d, 1); // Compute the difference MV v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); // We utilise the fact that the cost function is even, and use the // absolute difference. This allows us to use unsigned indexes later // and reduces cache pressure somewhat as only a half of the table // is ever referenced. v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); // Compute the SIMD pointer offsets. 
{ #if ARCH_X86_64 // sizeof(intptr_t) == 8 // Load the offsets __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]); __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]); // Set the ones falling outside to zero v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); v_bo32_q = _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); // Compute the candidate addresses v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); #else // ARCH_X86 // sizeof(intptr_t) == 4 __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]); v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); #endif } fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], in_what_stride, (uint32_t*)&v_sad_d); // Look up the component cost of the residual motion vector { const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); // Note: This is a use case for vpgather in AVX2 const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; __m128i v_cost_10_d, v_cost_32_d; v_cost_10_d = _mm_cvtsi32_si128(cost0); v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); v_cost_32_d = _mm_cvtsi32_si128(cost2); v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); } // Now add in the joint cost { const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); } // Multiply by sad_per_bit v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); // ROUND_POWER_OF_TWO(v_cost_d, 8) v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80)); v_cost_d = _mm_srai_epi32(v_cost_d, 8); // Add the cost to the sad v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); // Make the motion vectors outside the search area have max cost // by or'ing in the comparison mask, this way the minimum search won't // pick them. v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); // Find the minimum value and index horizontally in v_sad_d { // Try speculatively on 16 bits, so we can use the minpos intrinsic const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); // If the local best value is not saturated, just use it, otherwise // find the horizontal minimum again the hard way on 32 bits. // This is executed rarely. 
if (__unlikely__(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; v_loval_d = v_sad_d; v_loidx_d = _mm_set_epi32(3, 2, 1, 0); v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); v_hival_d = _mm_srli_si128(v_loval_d, 4); v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); local_best_sad = _mm_extract_epi32(v_loval_d, 0); local_best_idx = _mm_extract_epi32(v_loidx_d, 0); } // Update the global minimum if the local minimum is smaller if (__likely__(local_best_sad < best_sad)) { new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; best_sad = local_best_sad; } } } bmv = new_bmv; best_address = new_best_address; v_bmv_w = _mm_set1_epi32(bmv.as_int); #if ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif if (__unlikely__(best_address == in_what)) { (*num00)++; } } *best_mv = bmv.as_mv; return best_sad; }
/** * See av1_wedge_sign_from_residuals_c */ int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int64_t acc; __m128i v_sign_d; __m128i v_acc0_d = _mm_setzero_si128(); __m128i v_acc1_d = _mm_setzero_si128(); __m128i v_acc_q; // Input size limited to 8192 by the use of 32 bit accumulators and m // being between [0, 64]. Overflow might happen at larger sizes, // though it is practically impossible on real video input. assert(N < 8192); assert(N % 64 == 0); do { const __m128i v_m01_b = xx_load_128(m); const __m128i v_m23_b = xx_load_128(m + 16); const __m128i v_m45_b = xx_load_128(m + 32); const __m128i v_m67_b = xx_load_128(m + 48); const __m128i v_d0_w = xx_load_128(ds); const __m128i v_d1_w = xx_load_128(ds + 8); const __m128i v_d2_w = xx_load_128(ds + 16); const __m128i v_d3_w = xx_load_128(ds + 24); const __m128i v_d4_w = xx_load_128(ds + 32); const __m128i v_d5_w = xx_load_128(ds + 40); const __m128i v_d6_w = xx_load_128(ds + 48); const __m128i v_d7_w = xx_load_128(ds + 56); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); ds += 64; m += 64; N -= 64; } while (N); v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if ARCH_X86_64 acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q); #else xx_storel_64(&acc, v_acc_q); #endif return acc > limit; }
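The v_sign_d / unpack sequence at the end is how the two 32-bit accumulators are folded into a 64-bit total on SSE2: _mm_cmplt_epi32 against zero yields each lane's sign mask, and interleaving value and sign words sign-extends the lanes before the 64-bit adds. A minimal sketch of that step in isolation (helper name is mine):

#include <emmintrin.h>

static inline __m128i hsum64_from_epi32(__m128i v) {
  __m128i sign = _mm_cmplt_epi32(v, _mm_setzero_si128());
  __m128i lo = _mm_unpacklo_epi32(v, sign);            // lanes 0,1 sign-extended to 64 bits
  __m128i hi = _mm_unpackhi_epi32(v, sign);            // lanes 2,3 sign-extended to 64 bits
  __m128i sum = _mm_add_epi64(lo, hi);
  return _mm_add_epi64(sum, _mm_srli_si128(sum, 8));   // total in the low 64 bits
}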