static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);
  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT) {
    v16 dst, src, vco;
    v16 dif, res, xmm;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    res = _mm_subs_epi16(src, dst);
    /*
     * Due to premature clamps in-between subtracting two of the three operands,
     * we must be careful not to offset the result accidentally when subtracting
     * the corresponding VCO flag AFTER the saturation from doing (VS - VT).
     */
    dif = _mm_add_epi16(res, vco);
    dif = _mm_xor_si128(dif, res);    /* Adding one suddenly inverts the sign? */
    dif = _mm_and_si128(dif, dst);    /* Sign change due to subtracting a neg. */
    xmm = _mm_sub_epi16(src, dst);
    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
    xmm = _mm_and_si128(xmm, src);    /* VS + VT != INT16_MIN; VS + VT >= +32768 */
    xmm = _mm_srli_epi16(xmm, 15);    /* src = (INT16_MAX + 1 === INT16_MIN) ? */
    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */
    res = _mm_subs_epi16(res, xmm);

    _mm_store_si128((v16 *)VD, res);
    return;
}
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
  int32_t tmp[4];
  while (num_pairs-- > 0) {
    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
    // subtract
    const __m128i c0 = _mm_subs_epi16(a0, b0);
    const __m128i c1 = _mm_subs_epi16(a1, b1);
    // multiply/accumulate with self
    const __m128i d0 = _mm_madd_epi16(c0, c0);
    const __m128i d1 = _mm_madd_epi16(c1, c1);
    // collect
    const __m128i sum01 = _mm_add_epi32(d0, d1);
    sum = _mm_add_epi32(sum, sum01);
    a += 2 * BPS;
    b += 2 * BPS;
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
__m128i test_mm_subs_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_subs_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.psubs.w
  //
  // ASM-LABEL: test_mm_subs_epi16
  // ASM: psubsw
  return _mm_subs_epi16(A, B);
}
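The test above only checks instruction selection. For readers unfamiliar with the intrinsic, here is a minimal standalone sketch (not part of the test suite) of the saturating behavior it pins down:

/* Standalone sketch (not from the test suite): _mm_subs_epi16 clamps each
 * 16-bit lane to [-32768, 32767] instead of wrapping around. */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    const __m128i a = _mm_set1_epi16(-30000);
    const __m128i b = _mm_set1_epi16(10000);
    const __m128i r = _mm_subs_epi16(a, b);   /* -40000 saturates to -32768 */
    short out[8];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d\n", out[0]);                   /* prints -32768, not 25536 */
    return 0;
}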
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
                                                 const __m128i b) {
  const __m128i zero = _mm_setzero_si128();
  // convert to 16b
  const __m128i A0 = _mm_unpacklo_epi8(a, zero);
  const __m128i B0 = _mm_unpacklo_epi8(b, zero);
  const __m128i A1 = _mm_unpackhi_epi8(a, zero);
  const __m128i B1 = _mm_unpackhi_epi8(b, zero);
  // subtract
  const __m128i C0 = _mm_subs_epi16(A0, B0);
  const __m128i C1 = _mm_subs_epi16(A1, B1);
  // multiply with self
  const __m128i D0 = _mm_madd_epi16(C0, C0);
  const __m128i D1 = _mm_madd_epi16(C1, C1);
  // accumulate
  const __m128i sum = _mm_add_epi32(D0, D1);
  return sum;
}
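For comparison, a scalar sketch (not libwebp code) of what one call to SubtractAndAccumulate contributes: the horizontal sum of the four 32-bit lanes it returns equals the sum of squared differences over the 16 byte pairs. Because the inputs are 8-bit pixels widened to 16 bits, every difference lies in [-255, 255], so the saturating subtract never actually clips here.

#include <stdint.h>

/* Scalar sketch: total of the four 32-bit lanes produced by
 * SubtractAndAccumulate for one pair of 16-byte rows. */
static int SubtractAndAccumulate_C(const uint8_t a[16], const uint8_t b[16]) {
  int sum = 0;
  for (int i = 0; i < 16; ++i) {
    const int d = a[i] - b[i];   /* in [-255, 255], so no clipping occurs */
    sum += d * d;
  }
  return sum;
}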
void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                     int **rxdataF_comp,
                     short *ulsch_llr,
                     int **ul_ch_mag,
                     int **ul_ch_magb,
                     unsigned char symbol,
                     unsigned short nb_rb)
{
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag, *ch_magb;
  int j = 0, i;
  // unsigned char symbol_mod;

  if (symbol == 0)
    llrU = ulsch_llr;

  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag  = (__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
  ch_magb = (__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];

  for (i = 0; i < (nb_rb*3); i++) {
    mmtmpU1 = _mm_abs_epi16(rxF[i]);
    mmtmpU1 = _mm_subs_epi16(mmtmpU1, ch_mag[i]);
    mmtmpU2 = _mm_abs_epi16(mmtmpU1);
    mmtmpU2 = _mm_subs_epi16(mmtmpU2, ch_magb[i]);

    for (j = 0; j < 8; j++) {
      llrU[0] = ((short *)&rxF[i])[j];
      llrU[1] = ((short *)&mmtmpU1)[j];
      llrU[2] = ((short *)&mmtmpU2)[j];
      llrU += 3;
    }
  }

  _mm_empty();
  _m_empty();
}
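The loop above is a max-log LLR computation: the first bit's soft value is the matched-filter output itself, and each further bit folds the constellation by subtracting an amplitude threshold from an absolute value. A scalar sketch per real dimension (an illustration of the arithmetic only, not OpenAirInterface code; t1 and t2 play the roles of ch_mag[i] and ch_magb[i], and saturation is omitted):

/* Scalar sketch of the per-lane 64-QAM max-log LLRs computed above. */
static void qam64_llr_dim(short y, short t1, short t2, short llr[3]) {
  const short a1 = (short)(y < 0 ? -y : y);
  llr[0] = y;
  llr[1] = (short)(a1 - t1);
  llr[2] = (short)((llr[1] < 0 ? -llr[1] : llr[1]) - t2);
}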
SIMDValue SIMDInt16x8Operation::OpSubSaturate(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_subs_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // a - b saturates

    return X86SIMDValue::ToSIMDValue(x86Result);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
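TransformAC3 relies on a MUL helper defined elsewhere in the same file. In libwebp it is a 16.16 fixed-point multiply along the following lines (an assumption based on the surrounding source, which is not reproduced here); with that definition, the extra (1 << 16) in kC1 folds the "+ in[1]" / "+ in[4]" term of the inverse transform into the same multiply.

/* Assumed helper (defined in the surrounding file, not shown above):
 * 16.16 fixed-point multiply used by TransformAC3. */
#define MUL(a, b) (((a) * (b)) >> 16)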
__m64 _m_psubsw(__m64 _MM1, __m64 _MM2)
{
    /* Emulate the MMX psubsw instruction with its SSE2 counterpart:
     * widen both 64-bit operands into the low half of an XMM register,
     * do the saturating 16-bit subtract there, and return the low half.
     * Relies on the MSVC-style m128i_i64 / m64_i64 union members. */
    __m128i lhs = {0}, rhs = {0};

    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_subs_epi16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
                     int **rxdataF_comp,
                     short *ulsch_llr,
                     int **ul_ch_mag,
                     unsigned char symbol,
                     unsigned short nb_rb)
{
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag;
  int i;
  // unsigned char symbol_mod;

  // printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol);

  if (symbol == 0)
    llr128U = (__m128i*)&ulsch_llr[0];

  // symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag = (__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];

  for (i = 0; i < (nb_rb*3); i++) {
    mmtmpU0 = _mm_abs_epi16(rxF[i]);
    // print_shorts("tmp0",&tmp0);
    mmtmpU0 = _mm_subs_epi16(mmtmpU0, ch_mag[i]);

    llr128U[0] = _mm_unpacklo_epi16(rxF[i], mmtmpU0);
    llr128U[1] = _mm_unpackhi_epi16(rxF[i], mmtmpU0);
    llr128U += 2;
    // print_bytes("rxF[i]",&rxF[i]);
    // print_bytes("rxF[i+1]",&rxF[i+1]);
  }

  _mm_empty();
  _m_empty();
}
SIMD_INLINE __m128i AdjustedYuvToHue16(__m128i y, __m128i u, __m128i v, const __m128 & KF_255_DIV_6)
{
    // Convert YUV to RGB (16-bit lanes), then derive hue from max, min and range.
    const __m128i red = AdjustedYuvToRed16(y, v);
    const __m128i green = AdjustedYuvToGreen16(y, u, v);
    const __m128i blue = AdjustedYuvToBlue16(y, u);
    const __m128i max = MaxI16(red, green, blue);
    const __m128i range = _mm_subs_epi16(max, MinI16(red, green, blue));

    // Mutually exclusive masks selecting which channel is the maximum.
    const __m128i redMaxMask = _mm_cmpeq_epi16(red, max);
    const __m128i greenMaxMask = _mm_andnot_si128(redMaxMask, _mm_cmpeq_epi16(green, max));
    const __m128i blueMaxMask = _mm_andnot_si128(redMaxMask, _mm_andnot_si128(greenMaxMask, K_INV_ZERO));

    // Hue numerator per case: (G - B) + 6*range, (B - R) + 2*range, (R - G) + 4*range.
    const __m128i redMaxCase = _mm_and_si128(redMaxMask,
        _mm_add_epi16(_mm_sub_epi16(green, blue), _mm_mullo_epi16(range, K16_0006)));
    const __m128i greenMaxCase = _mm_and_si128(greenMaxMask,
        _mm_add_epi16(_mm_sub_epi16(blue, red), _mm_mullo_epi16(range, K16_0002)));
    const __m128i blueMaxCase = _mm_and_si128(blueMaxMask,
        _mm_add_epi16(_mm_sub_epi16(red, green), _mm_mullo_epi16(range, K16_0004)));

    const __m128i dividend = _mm_or_si128(_mm_or_si128(redMaxCase, greenMaxCase), blueMaxCase);

    // Scale dividend/range to [0, 255]; force hue to 0 where range == 0 (gray pixels).
    return _mm_andnot_si128(_mm_cmpeq_epi16(range, K_ZERO),
        _mm_and_si128(MulDiv16(dividend, range, KF_255_DIV_6), K16_00FF));
}
void CColorAdjustment::ProcessY(int Width, int Height, uint8_t *pData, int Pitch)
{
    if (m_Brightness != 0 || m_Contrast != 0) {
        if (m_fUpdateYTable) {
            MakeYTable(m_YTable, m_Brightness, m_Contrast);
            m_fUpdateYTable = false;
        }

#ifdef TVTVIDEODEC_SSE2_SUPPORT
        const bool fSSE2 = IsSSE2Enabled();
#endif

        for (int y = 0; y < Height; y++) {
            uint8_t *p = pData;
            int x = 0;

#ifdef TVTVIDEODEC_SSE2_SUPPORT
            if (fSSE2 && !((uintptr_t)p & 15)) {
                const short c = (short)(min((m_Contrast * 512 / 100) + 512, (1 << 16) - 1));
                const short b = (short)((m_Brightness * 255 / 100) + 16);
                const __m128i bc = _mm_set_epi16(b, c, b, c, b, c, b, c);
                const __m128i zero = _mm_setzero_si128();
                const __m128i w16 = _mm_set1_epi16(16);
                const __m128i w512 = _mm_set1_epi16(512);

                for (; x + 16 <= Width; x += 16) {
                    __m128i r = _mm_load_si128((const __m128i*)p);
                    __m128i rl = _mm_unpacklo_epi8(r, zero);
                    __m128i rh = _mm_unpackhi_epi8(r, zero);
                    rl = _mm_subs_epi16(rl, w16);
                    rh = _mm_subs_epi16(rh, w16);
                    __m128i rll = _mm_unpacklo_epi16(rl, w512);
                    __m128i rlh = _mm_unpackhi_epi16(rl, w512);
                    __m128i rhl = _mm_unpacklo_epi16(rh, w512);
                    __m128i rhh = _mm_unpackhi_epi16(rh, w512);
                    rll = _mm_madd_epi16(rll, bc);
                    rlh = _mm_madd_epi16(rlh, bc);
                    rhl = _mm_madd_epi16(rhl, bc);
                    rhh = _mm_madd_epi16(rhh, bc);
                    rll = _mm_srai_epi32(rll, 9);
                    rlh = _mm_srai_epi32(rlh, 9);
                    rhl = _mm_srai_epi32(rhl, 9);
                    rhh = _mm_srai_epi32(rhh, 9);
                    rl = _mm_packs_epi32(rll, rlh);
                    rh = _mm_packs_epi32(rhl, rhh);
                    r = _mm_packus_epi16(rl, rh);
                    _mm_store_si128((__m128i*)p, r);
                    p += 16;
                }
            }
#endif

            for (; x < Width; x++) {
                *p = m_YTable[*p];
                p++;
            }

            pData += Pitch;
        }
    }
}
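In scalar terms, the SSE2 branch above computes the following per luma sample (a sketch of the arithmetic, under the assumption that MakeYTable bakes the same formula into the lookup table used by the tail loop):

#include <stdint.h>

/* Scalar sketch of the SIMD path: (y - 16) is scaled by a 9-bit fixed-point
 * gain derived from the contrast, the brightness offset (including the 16
 * that was removed) is added back, and the final packus clamps to [0, 255]. */
static uint8_t AdjustY(uint8_t y, int Brightness, int Contrast)
{
    const int c = (Contrast * 512 / 100) + 512;   /* gain, 512 == 1.0 */
    const int b = (Brightness * 255 / 100) + 16;  /* offset */
    int v = (((int)y - 16) * c + b * 512) >> 9;   /* madd + srai(..., 9);
                                                     arithmetic shift assumed */
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
}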
pstatus_t sse2_alphaComp_argb( const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step, BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height) { const UINT32* sptr1 = (const UINT32*) pSrc1; const UINT32* sptr2 = (const UINT32*) pSrc2; UINT32* dptr; int linebytes, src1Jump, src2Jump, dstJump; UINT32 y; __m128i xmm0, xmm1; if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS; if (width < 4) /* pointless if too small */ { return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width, height); } dptr = (UINT32*) pDst; linebytes = width * sizeof(UINT32); src1Jump = (src1Step - linebytes) / sizeof(UINT32); src2Jump = (src2Step - linebytes) / sizeof(UINT32); dstJump = (dstStep - linebytes) / sizeof(UINT32); xmm0 = _mm_set1_epi32(0); xmm1 = _mm_set1_epi16(1); for (y = 0; y < height; ++y) { int pixels = width; int count; /* Get to the 16-byte boundary now. */ int leadIn = 0; switch ((ULONG_PTR) dptr & 0x0f) { case 0: leadIn = 0; break; case 4: leadIn = 3; break; case 8: leadIn = 2; break; case 12: leadIn = 1; break; default: /* We'll never hit a 16-byte boundary, so do the whole * thing the slow way. */ leadIn = width; break; } if (leadIn) { pstatus_t status; status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step, (const BYTE*) sptr2, src2Step, (BYTE*) dptr, dstStep, leadIn, 1); if (status != PRIMITIVES_SUCCESS) return status; sptr1 += leadIn; sptr2 += leadIn; dptr += leadIn; pixels -= leadIn; } /* Use SSE registers to do 4 pixels at a time. */ count = pixels >> 2; pixels -= count << 2; while (count--) { __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ xmm2 = LOAD_SI128(sptr1); sptr1 += 4; /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ xmm3 = LOAD_SI128(sptr2); sptr2 += 4; /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); /* subtract */ xmm6 = _mm_subs_epi16(xmm4, xmm5); /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); /* Add one to alphas */ xmm4 = _mm_adds_epi16(xmm4, xmm1); /* Multiply and take low word */ xmm4 = _mm_mullo_epi16(xmm4, xmm6); /* Shift 8 right */ xmm4 = _mm_srai_epi16(xmm4, 8); /* Add xmm5 */ xmm4 = _mm_adds_epi16(xmm4, xmm5); /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); /* subtract */ xmm7 = _mm_subs_epi16(xmm5, xmm6); /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); /* Add one to alphas */ xmm5 = _mm_adds_epi16(xmm5, xmm1); /* Multiply and take low word */ xmm5 = _mm_mullo_epi16(xmm5, xmm7); /* Shift 8 right */ xmm5 = _mm_srai_epi16(xmm5, 8); /* Add xmm6 */ xmm5 = _mm_adds_epi16(xmm5, xmm6); /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ /* Must mask off remainders or pack gets confused */ xmm3 = _mm_set1_epi16(0x00ffU); xmm4 = _mm_and_si128(xmm4, xmm3); xmm5 = _mm_and_si128(xmm5, xmm3); /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ xmm5 = _mm_packus_epi16(xmm5, xmm4); _mm_store_si128((__m128i*) dptr, xmm5); dptr += 4; } /* Finish off the remainder. 
*/ if (pixels) { pstatus_t status; status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step, (const BYTE*) sptr2, src2Step, (BYTE*) dptr, dstStep, pixels, 1); if (status != PRIMITIVES_SUCCESS) return status; sptr1 += pixels; sptr2 += pixels; dptr += pixels; } /* Jump to next row. */ sptr1 += src1Jump; sptr2 += src2Jump; dptr += dstJump; } return PRIMITIVES_SUCCESS; }
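Per channel, the two unpacked halves in the SSE2 loop above implement the usual alpha interpolation dst = src2 + ((src1 - src2) * (alpha1 + 1)) >> 8, where alpha1 is the alpha byte of the first source pixel. A scalar sketch of that arithmetic (an illustration only, not FreeRDP's generic fallback, and ignoring the 16-bit wrap corner cases the mullo/srai pair inherits):

/* Scalar sketch of the per-channel blend done by the SSE2 loop above. */
static inline BYTE alphaCompChannel(BYTE c1, BYTE c2, BYTE alpha1)
{
    int v = c2 + (((c1 - c2) * (alpha1 + 1)) >> 8);
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    return (BYTE) v;
}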
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. * * at first we fetch four U-values from its array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: * aabbccdd * aabbccdd, but we can only work on four pixel at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixel in each line, but can only process four * per loop. So we need to load the upper four pixel data from memory each secound loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixel from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* no we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and thats always good. * before we had dwords of data, and by shifting left and treating the result * as packed words, we get not only signed words, but do also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we have still signed words, and there are those min/max instructions in SSE2 ... * the max instruction takes always the bigger of the two operands and stores it in the first one, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we got our bytes. * the moment has come to assemble the three channels R,G and B to the xrgb dwords * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on Blue channel that one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* Only thing to do know is writing data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in line. 
*/ if (lastCol & 0x02) { /* let's say, we need to only convert six pixel in width * Ok, the first 4 pixel will be converted just like every 4 pixel else, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixel */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. We get only those pixel, which should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixel that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the secound line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, * we just skip all this. */ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointer by four pixel */ pDst += 16; YData += 4; }
void SGMStereo::calcRowCosts(unsigned char*& leftSobelRow, int*& leftCensusRow, unsigned char*& rightSobelRow, int*& rightCensusRow, unsigned short* costImageRow) { const int widthStepCost = width_*disparityTotal_; const __m128i registerZero = _mm_setzero_si128(); for (int y = 1; y < height_; ++y) { int addRowIndex = y + aggregationWindowRadius_; int addRowAggregatedCostIndex = std::min(addRowIndex, height_ - 1)%(aggregationWindowRadius_*2 + 2); unsigned short* addRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*addRowAggregatedCostIndex; if (addRowIndex < height_) { calcPixelwiseSAD(leftSobelRow, rightSobelRow); addPixelwiseHamming(leftCensusRow, rightCensusRow); memset(addRowAggregatedCost, 0, disparityTotal_*sizeof(unsigned short)); // x = 0 for (int x = 0; x <= aggregationWindowRadius_; ++x) { int scale = x == 0 ? aggregationWindowRadius_ + 1 : 1; for (int d = 0; d < disparityTotal_; ++d) { addRowAggregatedCost[d] += static_cast<unsigned short>(pixelwiseCostRow_[disparityTotal_*x + d]*scale); } } // x = 1...width-1 int subRowAggregatedCostIndex = std::max(y - aggregationWindowRadius_ - 1, 0)%(aggregationWindowRadius_*2 + 2); const unsigned short* subRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*subRowAggregatedCostIndex; const unsigned short* previousCostRow = costImageRow - widthStepCost; for (int x = 1; x < width_; ++x) { const unsigned char* addPixelwiseCost = pixelwiseCostRow_ + std::min((x + aggregationWindowRadius_)*disparityTotal_, (width_ - 1)*disparityTotal_); const unsigned char* subPixelwiseCost = pixelwiseCostRow_ + std::max((x - aggregationWindowRadius_ - 1)*disparityTotal_, 0); for (int d = 0; d < disparityTotal_; d += 16) { __m128i registerAddPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(addPixelwiseCost + d)); __m128i registerAddPixelwiseHigh = _mm_unpackhi_epi8(registerAddPixelwiseLow, registerZero); registerAddPixelwiseLow = _mm_unpacklo_epi8(registerAddPixelwiseLow, registerZero); __m128i registerSubPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(subPixelwiseCost + d)); __m128i registerSubPixelwiseHigh = _mm_unpackhi_epi8(registerSubPixelwiseLow, registerZero); registerSubPixelwiseLow = _mm_unpacklo_epi8(registerSubPixelwiseLow, registerZero); // Low __m128i registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost + disparityTotal_*(x - 1) + d)); registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseLow), registerAddPixelwiseLow); __m128i registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d)); registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost, _mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + disparityTotal_*x + d))), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d), registerCost); // High registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost + disparityTotal_*(x-1) + d + 8)); registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseHigh), registerAddPixelwiseHigh); registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d + 8)); registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost, _mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + 
disparityTotal_*x + d + 8))), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d + 8), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d + 8), registerCost); } } } leftSobelRow += widthStep_; rightSobelRow += widthStep_; leftCensusRow += width_; rightCensusRow += width_; costImageRow += widthStepCost; } }
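Stripped of the SIMD, the inner loop above is two nested sliding-window (box filter) updates: the per-row aggregate slides along x by adding the entering column of pixelwise costs and removing the leaving one, and the cost image slides along y the same way using the row aggregates. Roughly (a sketch using the same local names, with the saturating adds/subtracts omitted):

/* Scalar sketch of the recurrence in the SIMD loop above. */
for (int d = 0; d < disparityTotal_; ++d) {
    addRowAggregatedCost[disparityTotal_*x + d] =
        addRowAggregatedCost[disparityTotal_*(x - 1) + d]
        - subPixelwiseCost[d] + addPixelwiseCost[d];
    costImageRow[disparityTotal_*x + d] =
        previousCostRow[disparityTotal_*x + d]
        - subRowAggregatedCost[disparityTotal_*x + d]
        + addRowAggregatedCost[disparityTotal_*x + d];
}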
int smith_waterman_sse2_word(const unsigned char * query_sequence, unsigned short * query_profile_word, const int query_length, const unsigned char * db_sequence, const int db_length, unsigned short gap_open, unsigned short gap_extend, struct f_struct * f_str) { int i, j, k; short score; int cmp; int iter = (query_length + 7) / 8; __m128i *p; __m128i *workspace = (__m128i *) f_str->workspace; __m128i E, F, H; __m128i v_maxscore; __m128i v_gapopen; __m128i v_gapextend; __m128i v_min; __m128i v_minimums; __m128i v_temp; __m128i *pHLoad, *pHStore; __m128i *pE; __m128i *pScore; /* Load gap opening penalty to all elements of a constant */ v_gapopen = _mm_setzero_si128(); /* Apple Devel */ v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0); v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0); v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0); /* Load gap extension penalty to all elements of a constant */ v_gapextend = _mm_setzero_si128(); /* Apple Devel */ v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0); v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0); v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0); /* load v_maxscore with the zeros. since we are using signed */ /* math, we will bias the maxscore to -32768 so we have the */ /* full range of the short. */ v_maxscore = _mm_setzero_si128(); /* Apple Devel */ v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore); v_maxscore = _mm_slli_epi16 (v_maxscore, 15); v_minimums = _mm_shuffle_epi32 (v_maxscore, 0); v_min = _mm_shuffle_epi32 (v_maxscore, 0); v_min = _mm_srli_si128 (v_min, 14); /* Zero out the storage vector */ k = 2 * iter; p = workspace; for (i = 0; i < k; i++) { _mm_store_si128 (p++, v_maxscore); } pE = workspace; pHStore = pE + iter; pHLoad = pHStore + iter; for (i = 0; i < db_length; ++i) { /* fetch first data asap. */ pScore = (__m128i *) query_profile_word + db_sequence[i] * iter; /* bias all elements in F to -32768 */ F = _mm_setzero_si128(); /* Apple Devel */ F = _mm_cmpeq_epi16 (F, F); F = _mm_slli_epi16 (F, 15); /* load the next h value */ H = _mm_load_si128 (pHStore + iter - 1); H = _mm_slli_si128 (H, 2); H = _mm_or_si128 (H, v_min); p = pHLoad; pHLoad = pHStore; pHStore = p; for (j = 0; j < iter; j++) { /* load E values */ E = _mm_load_si128 (pE + j); /* add score to H */ H = _mm_adds_epi16 (H, *pScore++); /* Update highest score encountered this far */ v_maxscore = _mm_max_epi16 (v_maxscore, H); /* get max from H, E and F */ H = _mm_max_epi16 (H, E); H = _mm_max_epi16 (H, F); /* save H values */ _mm_store_si128 (pHStore + j, H); /* subtract the gap open penalty from H */ H = _mm_subs_epi16 (H, v_gapopen); /* update E value */ E = _mm_subs_epi16 (E, v_gapextend); E = _mm_max_epi16 (E, H); /* update F value */ F = _mm_subs_epi16 (F, v_gapextend); F = _mm_max_epi16 (F, H); /* save E values */ _mm_store_si128 (pE + j, E); /* load the next h value */ H = _mm_load_si128 (pHLoad + j); } /* reset pointers to the start of the saved data */ j = 0; H = _mm_load_si128 (pHStore + j); /* the computed F value is for the given column. since */ /* we are at the end, we need to shift the F value over */ /* to the next column. 
*/ F = _mm_slli_si128 (F, 2); F = _mm_or_si128 (F, v_min); v_temp = _mm_subs_epi16 (H, v_gapopen); v_temp = _mm_cmpgt_epi16 (F, v_temp); cmp = _mm_movemask_epi8 (v_temp); while (cmp != 0x0000) { E = _mm_load_si128 (pE + j); H = _mm_max_epi16 (H, F); /* save H values */ _mm_store_si128 (pHStore + j, H); /* update E in case the new H value would change it */ H = _mm_subs_epi16 (H, v_gapopen); E = _mm_max_epi16 (E, H); _mm_store_si128 (pE + j, E); /* update F value */ F = _mm_subs_epi16 (F, v_gapextend); j++; if (j >= iter) { j = 0; F = _mm_slli_si128 (F, 2); F = _mm_or_si128 (F, v_min); } H = _mm_load_si128 (pHStore + j); v_temp = _mm_subs_epi16 (H, v_gapopen); v_temp = _mm_cmpgt_epi16 (F, v_temp); cmp = _mm_movemask_epi8 (v_temp); } } /* find largest score in the v_maxscore vector */ v_temp = _mm_srli_si128 (v_maxscore, 8); v_maxscore = _mm_max_epi16 (v_maxscore, v_temp); v_temp = _mm_srli_si128 (v_maxscore, 4); v_maxscore = _mm_max_epi16 (v_maxscore, v_temp); v_temp = _mm_srli_si128 (v_maxscore, 2); v_maxscore = _mm_max_epi16 (v_maxscore, v_temp); /* extract the largest score */ score = _mm_extract_epi16 (v_maxscore, 0); /* return largest score biased by 32768 */ /* fix for Mac OSX clang 4.1 */ /* #ifdef __clang__ if (score < 0) score += 32768; return score; #else */ return score + 32768; /* #endif */ }
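For reference, a plain scalar version (a sketch, not FASTA code) of the recurrence the striped kernel above evaluates: local alignment with affine gaps, where the -32768 bias and the lazy-F correction loop of the SIMD code are replaced by an explicit floor at zero. Here profile[q * alpha_size + db[i]] is assumed to hold the substitution score of query position q against database symbol db[i].

#include <stdlib.h>

static int sw_affine_scalar(const short *profile, int alpha_size, int qlen,
                            const unsigned char *db, int dblen,
                            int gap_open, int gap_extend)
{
    int best = 0;
    int *H = (int *) calloc((size_t) qlen + 1, sizeof(int));
    int *E = (int *) calloc((size_t) qlen + 1, sizeof(int));
    for (int i = 0; i < dblen; ++i) {
        int diag = 0;   /* H from the previous row, previous column */
        int F = 0;      /* best vertical-gap score within this column */
        for (int j = 1; j <= qlen; ++j) {
            int h = diag + profile[(j - 1) * alpha_size + db[i]];
            if (h < E[j]) h = E[j];
            if (h < F)    h = F;
            if (h < 0)    h = 0;
            diag = H[j];            /* save H(i-1, j) for the next cell */
            H[j] = h;
            if (h > best) best = h;
            /* a gap of length k costs gap_open + (k - 1) * gap_extend */
            E[j] = (E[j] - gap_extend > h - gap_open) ? E[j] - gap_extend : h - gap_open;
            F    = (F    - gap_extend > h - gap_open) ? F    - gap_extend : h - gap_open;
        }
    }
    free(H);
    free(E);
    return best;
}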
void SGMStereo::performSGM(unsigned short* costImage, unsigned short* disparityImage) { const short costMax = SHRT_MAX; int widthStepCostImage = width_*disparityTotal_; short* costSums = sgmBuffer_; memset(costSums, 0, costSumBufferSize_*sizeof(short)); short** pathCosts = new short*[pathRowBufferTotal_]; short** pathMinCosts = new short*[pathRowBufferTotal_]; const int processPassTotal = 2; for (int processPassCount = 0; processPassCount < processPassTotal; ++processPassCount) { int startX, endX, stepX; int startY, endY, stepY; if (processPassCount == 0) { startX = 0; endX = width_; stepX = 1; startY = 0; endY = height_; stepY = 1; } else { startX = width_ - 1; endX = -1; stepX = -1; startY = height_ - 1; endY = -1; stepY = -1; } for (int i = 0; i < pathRowBufferTotal_; ++i) { pathCosts[i] = costSums + costSumBufferSize_ + pathCostBufferSize_*i + pathDisparitySize_ + 8; memset(pathCosts[i] - pathDisparitySize_ - 8, 0, pathCostBufferSize_*sizeof(short)); pathMinCosts[i] = costSums + costSumBufferSize_ + pathCostBufferSize_*pathRowBufferTotal_ + pathMinCostBufferSize_*i + pathTotal_*2; memset(pathMinCosts[i] - pathTotal_, 0, pathMinCostBufferSize_*sizeof(short)); } for (int y = startY; y != endY; y += stepY) { unsigned short* pixelCostRow = costImage + widthStepCostImage*y; short* costSumRow = costSums + costSumBufferRowSize_*y; memset(pathCosts[0] - pathDisparitySize_ - 8, 0, pathDisparitySize_*sizeof(short)); memset(pathCosts[0] + width_*pathDisparitySize_ - 8, 0, pathDisparitySize_*sizeof(short)); memset(pathMinCosts[0] - pathTotal_, 0, pathTotal_*sizeof(short)); memset(pathMinCosts[0] + width_*pathTotal_, 0, pathTotal_*sizeof(short)); for (int x = startX; x != endX; x += stepX) { int pathMinX = x*pathTotal_; int pathX = pathMinX*disparitySize_; int previousPathMin0 = pathMinCosts[0][pathMinX - stepX*pathTotal_] + smoothnessPenaltyLarge_; int previousPathMin2 = pathMinCosts[1][pathMinX + 2] + smoothnessPenaltyLarge_; short* previousPathCosts0 = pathCosts[0] + pathX - stepX*pathDisparitySize_; short* previousPathCosts2 = pathCosts[1] + pathX + disparitySize_*2; previousPathCosts0[-1] = previousPathCosts0[disparityTotal_] = costMax; previousPathCosts2[-1] = previousPathCosts2[disparityTotal_] = costMax; short* pathCostCurrent = pathCosts[0] + pathX; const unsigned short* pixelCostCurrent = pixelCostRow + disparityTotal_*x; short* costSumCurrent = costSumRow + disparityTotal_*x; __m128i regPenaltySmall = _mm_set1_epi16(static_cast<short>(smoothnessPenaltySmall_)); __m128i regPathMin0, regPathMin2; regPathMin0 = _mm_set1_epi16(static_cast<short>(previousPathMin0)); regPathMin2 = _mm_set1_epi16(static_cast<short>(previousPathMin2)); __m128i regNewPathMin = _mm_set1_epi16(costMax); for (int d = 0; d < disparityTotal_; d += 8) { __m128i regPixelCost = _mm_load_si128(reinterpret_cast<const __m128i*>(pixelCostCurrent + d)); __m128i regPathCost0, regPathCost2; regPathCost0 = _mm_load_si128(reinterpret_cast<const __m128i*>(previousPathCosts0 + d)); regPathCost2 = _mm_load_si128(reinterpret_cast<const __m128i*>(previousPathCosts2 + d)); regPathCost0 = _mm_min_epi16(regPathCost0, _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts0 + d - 1)), regPenaltySmall)); regPathCost0 = _mm_min_epi16(regPathCost0, _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts0 + d + 1)), regPenaltySmall)); regPathCost2 = _mm_min_epi16(regPathCost2, _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts2 + d - 1)), regPenaltySmall)); 
regPathCost2 = _mm_min_epi16(regPathCost2, _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts2 + d + 1)), regPenaltySmall)); regPathCost0 = _mm_min_epi16(regPathCost0, regPathMin0); regPathCost0 = _mm_adds_epi16(_mm_subs_epi16(regPathCost0, regPathMin0), regPixelCost); regPathCost2 = _mm_min_epi16(regPathCost2, regPathMin2); regPathCost2 = _mm_adds_epi16(_mm_subs_epi16(regPathCost2, regPathMin2), regPixelCost); _mm_store_si128(reinterpret_cast<__m128i*>(pathCostCurrent + d), regPathCost0); _mm_store_si128(reinterpret_cast<__m128i*>(pathCostCurrent + d + disparitySize_*2), regPathCost2); __m128i regMin02 = _mm_min_epi16(_mm_unpacklo_epi16(regPathCost0, regPathCost2), _mm_unpackhi_epi16(regPathCost0, regPathCost2)); regMin02 = _mm_min_epi16(_mm_unpacklo_epi16(regMin02, regMin02), _mm_unpackhi_epi16(regMin02, regMin02)); regNewPathMin = _mm_min_epi16(regNewPathMin, regMin02); __m128i regCostSum = _mm_load_si128(reinterpret_cast<const __m128i*>(costSumCurrent + d)); regCostSum = _mm_adds_epi16(regCostSum, regPathCost0); regCostSum = _mm_adds_epi16(regCostSum, regPathCost2); _mm_store_si128(reinterpret_cast<__m128i*>(costSumCurrent + d), regCostSum); } regNewPathMin = _mm_min_epi16(regNewPathMin, _mm_srli_si128(regNewPathMin, 8)); _mm_storel_epi64(reinterpret_cast<__m128i*>(&pathMinCosts[0][pathMinX]), regNewPathMin); } if (processPassCount == processPassTotal - 1) { unsigned short* disparityRow = disparityImage + width_*y; for (int x = 0; x < width_; ++x) { short* costSumCurrent = costSumRow + disparityTotal_*x; int bestSumCost = costSumCurrent[0]; int bestDisparity = 0; for (int d = 1; d < disparityTotal_; ++d) { if (costSumCurrent[d] < bestSumCost) { bestSumCost = costSumCurrent[d]; bestDisparity = d; } } if (bestDisparity > 0 && bestDisparity < disparityTotal_ - 1) { int centerCostValue = costSumCurrent[bestDisparity]; int leftCostValue = costSumCurrent[bestDisparity - 1]; int rightCostValue = costSumCurrent[bestDisparity + 1]; if (rightCostValue < leftCostValue) { bestDisparity = static_cast<int>(bestDisparity*disparityFactor_ + static_cast<double>(rightCostValue - leftCostValue)/(centerCostValue - leftCostValue)/2.0*disparityFactor_ + 0.5); } else { bestDisparity = static_cast<int>(bestDisparity*disparityFactor_ + static_cast<double>(rightCostValue - leftCostValue)/(centerCostValue - rightCostValue)/2.0*disparityFactor_ + 0.5); } } else { bestDisparity = static_cast<int>(bestDisparity*disparityFactor_); } disparityRow[x] = static_cast<unsigned short>(bestDisparity); } } std::swap(pathCosts[0], pathCosts[1]); std::swap(pathMinCosts[0], pathMinCosts[1]); } } delete[] pathCosts; delete[] pathMinCosts; speckleFilter(100, static_cast<int>(2*disparityFactor_), disparityImage); }
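The vectorized update above is the standard SGM path-cost recurrence; the code subtracts (previous minimum + P2) rather than the previous minimum alone, which only offsets every cost along the path by the same constant and leaves the winning disparity unchanged. A scalar sketch, where P1 = smoothnessPenaltySmall_, P2 = smoothnessPenaltyLarge_, prev[] holds the previous pixel's path costs along this direction, and prevMin is their minimum:

/* Scalar sketch of one pixel's path-cost update along one scan direction. */
for (int d = 0; d < disparityTotal_; ++d) {
    int m = prev[d];
    if (d > 0                   && prev[d - 1] + P1 < m) m = prev[d - 1] + P1;
    if (d + 1 < disparityTotal_ && prev[d + 1] + P1 < m) m = prev[d + 1] + P1;
    if (prevMin + P2 < m)                                m = prevMin + P2;
    pathCost[d] = pixelCost[d] + m - prevMin;
}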
int global_sse2_word(int queryLength, unsigned short *profile, const unsigned char *dbSeq, int dbLength, unsigned short gapOpen, unsigned short gapExtend, unsigned short ceiling, struct f_struct *f_str) { int i, j; int score; int scale; int temp; int distance; int offset; int position; int cmp; int iter; __m128i *pvH; __m128i *pvE; __m128i vE, vF, vH; __m128i vHNext; __m128i vFPrev; __m128i vGapOpen; __m128i vGapExtend; __m128i vCeiling; __m128i vScale; __m128i vScaleAmt; __m128i vScaleTmp; __m128i vTemp; __m128i vNull; __m128i *pvScore; scale = 0; iter = (queryLength + 7) / 8; offset = (queryLength - 1) % iter; position = 7 - (queryLength - 1) / iter; pvH = (__m128i *)f_str->workspace; pvE = pvH + iter; /* Load gap opening penalty to all elements of a constant */ vGapOpen = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0); vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0); vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0); /* Load gap extension penalty to all elements of a constant */ vGapExtend = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0); vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0); vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0); /* Generate the ceiling before scaling */ vTemp = _mm_setzero_si128(); /* transfered from Apple Devel smith_waterman_sse2.c fix */ vTemp = _mm_insert_epi16 (vTemp, ceiling, 0); vTemp = _mm_shufflelo_epi16 (vTemp, 0); vTemp = _mm_shuffle_epi32 (vTemp, 0); vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp); vCeiling = _mm_srli_epi16 (vCeiling, 1); vCeiling = _mm_subs_epi16 (vCeiling, vTemp); vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen); vNull = _mm_cmpeq_epi16 (vTemp, vTemp); vNull = _mm_slli_epi16 (vNull, 15); vScaleAmt = _mm_xor_si128 (vNull, vNull); /* Zero out the storage vector */ vTemp = _mm_adds_epi16 (vNull, vGapOpen); for (i = 0; i < iter; i++) { _mm_store_si128 (pvH + i, vTemp); _mm_store_si128 (pvE + i, vNull); } /* initialize F */ vF = vNull; vFPrev = vNull; /* load and scale H for the next round */ vTemp = _mm_srli_si128 (vGapOpen, 14); vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_adds_epi16 (vH, vTemp); for (i = 0; i < dbLength; ++i) { /* fetch first data asap. 
*/ pvScore = (__m128i *) profile + dbSeq[i] * iter; vF = vNull; vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < iter; j++) { /* correct H from the previous columns F */ vHNext = _mm_load_si128 (pvH + j); vHNext = _mm_max_epi16 (vHNext, vFPrev); /* load and correct E value */ vE = _mm_load_si128 (pvE + j); vTemp = _mm_subs_epi16 (vHNext, vGapOpen); vE = _mm_max_epi16 (vE, vTemp); _mm_store_si128 (pvE + j, vE); /* add score to vH */ vH = _mm_adds_epi16 (vH, *pvScore++); /* get max from vH, vE and vF */ vH = _mm_max_epi16 (vH, vE); vH = _mm_max_epi16 (vH, vF); _mm_store_si128 (pvH + j, vH); /* update vF value */ vH = _mm_subs_epi16 (vH, vGapOpen); vF = _mm_max_epi16 (vF, vH); /* load the next h values */ vH = vHNext; } /* check if we need to scale before the next round */ vTemp = _mm_cmpgt_epi16 (vF, vCeiling); cmp = _mm_movemask_epi8 (vTemp); /* broadcast F values */ vF = _mm_xor_si128 (vF, vNull); vTemp = _mm_slli_si128 (vF, 2); vTemp = _mm_subs_epu16 (vTemp, vScaleAmt); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vF, 4); vScaleTmp = _mm_slli_si128 (vScaleAmt, 2); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); vTemp = _mm_slli_si128 (vScaleTmp, 4); vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp); vTemp = _mm_slli_si128 (vF, 8); vTemp = _mm_subs_epu16 (vTemp, vScaleTmp); vF = max_epu16 (vF, vTemp); /* scale if necessary */ if (cmp != 0x0000) { __m128i vScale1; __m128i vScale2; vScale = _mm_slli_si128 (vF, 2); vScale = _mm_subs_epu16 (vScale, vGapOpen); vScale = _mm_subs_epu16 (vScale, vScaleAmt); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vScale, vTemp); vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp); vTemp = _mm_slli_si128 (vScale, 2); vTemp = _mm_subs_epu16 (vTemp, vScale); vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp); /* rescale the previous F */ vF = _mm_subs_epu16 (vF, vScale); /* check if we can continue in signed 16-bits */ vTemp = _mm_xor_si128 (vF, vNull); vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling); cmp = _mm_movemask_epi8 (vTemp); if (cmp != 0x0000) { return OVERFLOW_SCORE; } vTemp = _mm_adds_epi16 (vCeiling, vCeiling); vScale1 = _mm_subs_epu16 (vScale, vTemp); vScale2 = _mm_subs_epu16 (vScale, vScale1); /* scale all the vectors */ for (j = 0; j < iter; j++) { /* load H and E */ vH = _mm_load_si128 (pvH + j); vE = _mm_load_si128 (pvE + j); /* get max from vH, vE and vF */ vH = _mm_subs_epi16 (vH, vScale1); vH = _mm_subs_epi16 (vH, vScale2); vE = _mm_subs_epi16 (vE, vScale1); vE = _mm_subs_epi16 (vE, vScale2); /* save the H and E */ _mm_store_si128 (pvH + j, vH); _mm_store_si128 (pvE + j, vE); } vScale = vScaleAmt; for (j = 0; j < position; ++j) { vScale = _mm_slli_si128 (vScale, 2); } /* calculate the final scaling amount */ vTemp = _mm_xor_si128 (vTemp, vTemp); vScale1 = _mm_unpacklo_epi16 (vScale, vTemp); vScale2 = _mm_unpackhi_epi16 (vScale, vTemp); vScale = _mm_add_epi32 (vScale1, vScale2); vTemp = _mm_srli_si128 (vScale, 8); vScale = _mm_add_epi32 (vScale, vTemp); vTemp = _mm_srli_si128 (vScale, 4); vScale = _mm_add_epi32 (vScale, vTemp); scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0); temp = (int) (unsigned short) _mm_extract_epi16 (vScale, 1); scale = scale + (temp << 16); } /* scale the F value for the next round */ vFPrev = _mm_slli_si128 (vF, 2); vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt); vFPrev = _mm_xor_si128 (vFPrev, vNull); /* load and scale H for the next round */ vH = _mm_load_si128 (pvH + iter - 1); vH = _mm_xor_si128 (vH, vNull); vH = _mm_slli_si128 
(vH, 2); vH = _mm_subs_epu16 (vH, vScaleAmt); vH = _mm_insert_epi16 (vH, gapOpen, 0); vH = _mm_xor_si128 (vH, vNull); } vH = _mm_load_si128 (pvH + offset); vH = _mm_max_epi16 (vH, vFPrev); for (j = 0; j < position; ++j) { vH = _mm_slli_si128 (vH, 2); } score = (int) (signed short) _mm_extract_epi16 (vH, 7); score = score + SHORT_BIAS; /* return largest score */ distance = (queryLength + dbLength) * gapExtend; score = score - (gapOpen * 2) - distance + scale; return score; }
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; const __m128i thresh = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); const __m128i blimit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); q4p4 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); q3p3 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); q2p2 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); q1p1 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); p1q1 = _mm_shuffle_epi32(q1p1, 78); q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0p0 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); __m128i qs0 = _mm_xor_si128(p0q0, t80); __m128i qs1 = _mm_xor_si128(p1q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, qs0ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); filter1 = _mm_unpacklo_epi8(zero, filter1); filter1 = _mm_srai_epi16(filter1, 0xB); filter2 = 
_mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 0xB); /* Filter1 >> 3 */ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); /* filt >> 1 */ filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); // loopfilter done { __m128i work; flat = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); q5p5 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); q6p6 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); q7p7 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations { const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; __m128i pixelFilter_p, pixelFilter_q; __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; p7_16 = _mm_unpacklo_epi8(q7p7, zero); p6_16 = _mm_unpacklo_epi8(q6p6, zero); p5_16 = _mm_unpacklo_epi8(q5p5, zero); p4_16 = _mm_unpacklo_epi8(q4p4, zero); p3_16 = _mm_unpacklo_epi8(q3p3, zero); p2_16 = _mm_unpacklo_epi8(q2p2, zero); p1_16 = _mm_unpacklo_epi8(q1p1, zero); p0_16 = _mm_unpacklo_epi8(q0p0, zero); q0_16 = _mm_unpackhi_epi8(q0p0, zero); q1_16 = _mm_unpackhi_epi8(q1p1, zero); q2_16 = _mm_unpackhi_epi8(q2p2, zero); q3_16 = _mm_unpackhi_epi8(q3p3, zero); q4_16 = _mm_unpackhi_epi8(q4p4, zero); q5_16 = _mm_unpackhi_epi8(q5p5, zero); q6_16 = _mm_unpackhi_epi8(q6p6, zero); q7_16 = _mm_unpackhi_epi8(q7p7, zero); pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), _mm_add_epi16(p4_16, p3_16)); pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), _mm_add_epi16(q4_16, q3_16)); pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); pixetFilter_p2p1p0 = _mm_add_epi16( four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); res_p = _mm_srli_epi16( 
_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); flat2_q0p0 = _mm_packus_epi16(res_p, res_q); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); flat_q0p0 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(p7_16, p7_16); sum_q7 = _mm_add_epi16(q7_16, q7_16); sum_p3 = _mm_add_epi16(p3_16, p3_16); sum_q3 = _mm_add_epi16(q3_16, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); flat2_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); sum_p3 = _mm_add_epi16(sum_p3, p3_16); sum_q3 = _mm_add_epi16(sum_q3, q3_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); flat2_q2p2 = _mm_packus_epi16(res_p, res_q); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); flat2_q3p3 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); flat2_q4p4 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); flat2_q5p5 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(sum_p7, p6_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat = _mm_shuffle_epi32(flat, 68); flat2 = _mm_shuffle_epi32(flat2, 68); q2p2 = _mm_andnot_si128(flat, q2p2); flat_q2p2 = _mm_and_si128(flat, flat_q2p2); q2p2 = _mm_or_si128(q2p2, flat_q2p2); qs1ps1 = _mm_andnot_si128(flat, qs1ps1); flat_q1p1 = _mm_and_si128(flat, flat_q1p1); q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); qs0ps0 = _mm_andnot_si128(flat, qs0ps0); flat_q0p0 = _mm_and_si128(flat, flat_q0p0); q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } }
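One idiom worth calling out: throughout the loop filter above, absolute differences of unsigned bytes are built from two saturating subtracts OR'ed together, since whichever operand is smaller saturates to zero. A minimal standalone helper illustrating it (not libvpx code):

#include <emmintrin.h>

/* |a - b| for 16 unsigned bytes: (a -sat b) | (b -sat a). */
static __m128i abs_diff_u8(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}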