Example #1
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
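For reference, here is a scalar sketch of what the SSE4x4 routine above computes: the sum of squared differences over a 4x4 block with row stride BPS. Since the pixels are widened to 16 bits before _mm_subs_epi16, the differences fit in [-255, 255] and no saturation actually occurs, so the plain scalar form below is equivalent (illustrative only, not part of the original source).

// Illustrative scalar equivalent of SSE4x4 (assumes the same BPS row stride).
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
  int sum = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) {
      const int diff = a[y * BPS + x] - b[y * BPS + x];
      sum += diff * diff;  // accumulate squared difference
    }
  }
  return sum;
}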
Example #2
File: add.c Project: 8l/rsp
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 dif, res, xmm;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    res = _mm_subs_epi16(src, dst);

/*
 * Due to premature clamps in-between subtracting two of the three operands,
 * we must be careful not to offset the result accidentally when subtracting
 * the corresponding VCO flag AFTER the saturation from doing (VS - VT).
 */
    dif = _mm_add_epi16(res, vco);
    dif = _mm_xor_si128(dif, res); /* Adding one suddenly inverts the sign? */
    dif = _mm_and_si128(dif, dst); /* Sign change due to subtracting a neg. */
    xmm = _mm_sub_epi16(src, dst);
    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
    xmm = _mm_and_si128(xmm, src); /* VS + VT != INT16_MIN; VS + VT >= +32768 */
    xmm = _mm_srli_epi16(xmm, 15); /* src = (INT16_MAX + 1 === INT16_MIN) ? */

    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */
    res = _mm_subs_epi16(res, xmm);
    _mm_store_si128((v16 *)VD, res);
    return;
}
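The bit-twiddling above corrects the saturated (VS - VT) for the per-lane borrow flags in cf_co without saturating twice. A scalar model of the intended result, assuming the usual RSP VSUB semantics of clamp16(VS - VT - carry), might look like this (an illustration, not code from the 8l/rsp project):

#include <stdint.h>

/* Scalar model (assumption: VD = clamp16(VS - VT - carry), carry taken from cf_co). */
static void SIGNED_CLAMP_SUB_scalar(int16_t *VD, const int16_t *VS,
                                     const int16_t *VT, const int16_t *carry)
{
    for (int i = 0; i < 8; i++) {
        int32_t d = (int32_t)VS[i] - (int32_t)VT[i] - (carry[i] & 1);
        if (d > INT16_MAX) d = INT16_MAX;  /* saturate high */
        if (d < INT16_MIN) d = INT16_MIN;  /* saturate low  */
        VD[i] = (int16_t)d;
    }
}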
Example #3
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
  int32_t tmp[4];
  while (num_pairs-- > 0) {
    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
    // subtract
    const __m128i c0 = _mm_subs_epi16(a0, b0);
    const __m128i c1 = _mm_subs_epi16(a1, b1);
    // multiply/accumulate with self
    const __m128i d0 = _mm_madd_epi16(c0, c0);
    const __m128i d1 = _mm_madd_epi16(c1, c1);
    // collect
    const __m128i sum01 = _mm_add_epi32(d0, d1);
    sum = _mm_add_epi32(sum, sum01);
    a += 2 * BPS;
    b += 2 * BPS;
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
Example #4
__m128i test_mm_subs_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_subs_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.psubs.w
  //
  // ASM-LABEL: test_mm_subs_epi16
  // ASM: psubsw
  return _mm_subs_epi16(A, B);
}
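All of these examples rest on the same primitive: _mm_subs_epi16 performs eight independent signed 16-bit subtractions with saturation. A minimal scalar model of a single lane, for illustration:

#include <stdint.h>

/* One 16-bit lane of _mm_subs_epi16: signed saturating subtract. */
static int16_t subs_epi16_lane(int16_t a, int16_t b) {
  const int32_t r = (int32_t)a - (int32_t)b;
  if (r > INT16_MAX) return INT16_MAX;   /*  32767 */
  if (r < INT16_MIN) return INT16_MIN;   /* -32768 */
  return (int16_t)r;
}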
Example #5
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
                                                 const __m128i b) {
  const __m128i zero = _mm_setzero_si128();
  // convert to 16b
  const __m128i A0 = _mm_unpacklo_epi8(a, zero);
  const __m128i B0 = _mm_unpacklo_epi8(b, zero);
  const __m128i A1 = _mm_unpackhi_epi8(a, zero);
  const __m128i B1 = _mm_unpackhi_epi8(b, zero);
  // subtract
  const __m128i C0 = _mm_subs_epi16(A0, B0);
  const __m128i C1 = _mm_subs_epi16(A1, B1);
  // multiply with self
  const __m128i D0 = _mm_madd_epi16(C0, C0);
  const __m128i D1 = _mm_madd_epi16(C1, C1);
  // accumulate
  const __m128i sum = _mm_add_epi32(D0, D1);
  return sum;
}
Example #6
void ulsch_64qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
		     int **rxdataF_comp,
		     short *ulsch_llr,
		     int **ul_ch_mag,
		     int **ul_ch_magb,
		     unsigned char symbol,
		     unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag,*ch_magb;
  int j=0,i;
  //  unsigned char symbol_mod;


  if (symbol == 0)
    llrU = ulsch_llr;

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];
  ch_magb =(__m128i*)&ul_ch_magb[0][(symbol*frame_parms->N_RB_DL*12)];


  for (i=0;i<(nb_rb*3);i++) {


    mmtmpU1 = _mm_abs_epi16(rxF[i]);
    mmtmpU1  = _mm_subs_epi16(mmtmpU1,ch_mag[i]);
    mmtmpU2 = _mm_abs_epi16(mmtmpU1);
    mmtmpU2 = _mm_subs_epi16(mmtmpU2,ch_magb[i]);

    for (j=0;j<8;j++) {
      llrU[0] = ((short *)&rxF[i])[j];
      llrU[1] = ((short *)&mmtmpU1)[j];
      llrU[2] = ((short *)&mmtmpU2)[j];
      llrU+=3;
    }

  }

  _mm_empty();
  _m_empty();

}
Example #7
    SIMDValue SIMDInt16x8Operation::OpSubSaturate(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        x86Result.m128i_value = _mm_subs_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // a - b saturates

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #8
static void TransformAC3(const int16_t* in, uint8_t* dst) {
    static const int kC1 = 20091 + (1 << 16);
    static const int kC2 = 35468;
    const __m128i A = _mm_set1_epi16(in[0] + 4);
    const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
    const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
    const int c1 = MUL(in[1], kC2);
    const int d1 = MUL(in[1], kC1);
    const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
    const __m128i B = _mm_adds_epi16(A, CD);
    const __m128i m0 = _mm_adds_epi16(B, d4);
    const __m128i m1 = _mm_adds_epi16(B, c4);
    const __m128i m2 = _mm_subs_epi16(B, c4);
    const __m128i m3 = _mm_subs_epi16(B, d4);
    const __m128i zero = _mm_setzero_si128();
    // Load the source pixels.
    __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
    __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
    __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
    __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform.
    dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
    dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
    dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
    dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
    *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
    *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
    *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
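The MUL helper used for kC1/kC2 is not part of this listing; in the libwebp sources it is essentially a 16.16 fixed-point multiply, roughly as below (quoted from memory, treat as an assumption):

// Assumed definition of MUL: 16.16 fixed-point multiply, so kC1/kC2 act as Q16 constants.
#define MUL(a, b) (((a) * (b)) >> 16)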
Example #9
__m64 _m_psubsw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_subs_epi16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example #10
void ulsch_16qam_llr(LTE_DL_FRAME_PARMS *frame_parms,
		     int **rxdataF_comp,
		     short *ulsch_llr,
		     int **ul_ch_mag,
		     unsigned char symbol,
		     unsigned short nb_rb) {

  __m128i *rxF=(__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  __m128i *ch_mag;
  int i;
  //  unsigned char symbol_mod;

  //  printf("ulsch_rx.c: ulsch_16qam_llr: symbol %d\n",symbol);

  if (symbol == 0)
    llr128U = (__m128i*)&ulsch_llr[0];

  //  symbol_mod = (symbol>=(7-frame_parms->Ncp)) ? symbol-(7-frame_parms->Ncp) : symbol;

  ch_mag =(__m128i*)&ul_ch_mag[0][(symbol*frame_parms->N_RB_DL*12)];


  for (i=0;i<(nb_rb*3);i++) {


    mmtmpU0 = _mm_abs_epi16(rxF[i]);
    //    print_shorts("tmp0",&tmp0);

    mmtmpU0 = _mm_subs_epi16(mmtmpU0,ch_mag[i]);


    llr128U[0] = _mm_unpacklo_epi16(rxF[i],mmtmpU0);
    llr128U[1] = _mm_unpackhi_epi16(rxF[i],mmtmpU0);
    llr128U+=2;

    //    print_bytes("rxF[i]",&rxF[i]);
    //    print_bytes("rxF[i+1]",&rxF[i+1]);
  }

  _mm_empty();
  _m_empty();

}
Example #11
		SIMD_INLINE __m128i AdjustedYuvToHue16(__m128i y, __m128i u, __m128i v, const __m128 & KF_255_DIV_6)
		{
			const __m128i red = AdjustedYuvToRed16(y, v);
			const __m128i green = AdjustedYuvToGreen16(y, u, v);
			const __m128i blue = AdjustedYuvToBlue16(y, u);
			const __m128i max = MaxI16(red, green, blue);
			const __m128i range = _mm_subs_epi16(max, MinI16(red, green, blue));

			const __m128i redMaxMask = _mm_cmpeq_epi16(red, max);
			const __m128i greenMaxMask = _mm_andnot_si128(redMaxMask, _mm_cmpeq_epi16(green, max));
			const __m128i blueMaxMask = _mm_andnot_si128(redMaxMask, _mm_andnot_si128(greenMaxMask, K_INV_ZERO));

			const __m128i redMaxCase = _mm_and_si128(redMaxMask, 
				_mm_add_epi16(_mm_sub_epi16(green, blue), _mm_mullo_epi16(range, K16_0006))); 
			const __m128i greenMaxCase = _mm_and_si128(greenMaxMask, 
				_mm_add_epi16(_mm_sub_epi16(blue, red), _mm_mullo_epi16(range, K16_0002))); 
			const __m128i blueMaxCase = _mm_and_si128(blueMaxMask, 
				_mm_add_epi16(_mm_sub_epi16(red, green), _mm_mullo_epi16(range, K16_0004))); 

			const __m128i dividend = _mm_or_si128(_mm_or_si128(redMaxCase, greenMaxCase), blueMaxCase);

			return _mm_andnot_si128(_mm_cmpeq_epi16(range, K_ZERO), _mm_and_si128(MulDiv16(dividend, range, KF_255_DIV_6), K16_00FF));
		}
Example #12
void CColorAdjustment::ProcessY(int Width, int Height, uint8_t *pData, int Pitch)
{
	if (m_Brightness != 0 || m_Contrast != 0) {
		if (m_fUpdateYTable) {
			MakeYTable(m_YTable, m_Brightness, m_Contrast);
			m_fUpdateYTable = false;
		}

#ifdef TVTVIDEODEC_SSE2_SUPPORT
		const bool fSSE2 = IsSSE2Enabled();
#endif

		for (int y = 0; y < Height; y++) {
			uint8_t *p = pData;
			int x = 0;

#ifdef TVTVIDEODEC_SSE2_SUPPORT
			if (fSSE2 && !((uintptr_t)p & 15)) {
				const short c = (short)(min((m_Contrast * 512 / 100) + 512, (1 << 16) - 1));
				const short b = (short)((m_Brightness * 255 / 100) + 16);
				const __m128i bc = _mm_set_epi16(b, c, b, c, b, c, b, c);
				const __m128i zero = _mm_setzero_si128();
				const __m128i w16 = _mm_set1_epi16(16);
				const __m128i w512 = _mm_set1_epi16(512);

				for (; x + 16 <= Width; x += 16) {
					__m128i r = _mm_load_si128((const __m128i*)p);

					__m128i rl = _mm_unpacklo_epi8(r, zero);
					__m128i rh = _mm_unpackhi_epi8(r, zero);

					rl = _mm_subs_epi16(rl, w16);
					rh = _mm_subs_epi16(rh, w16);

					__m128i rll = _mm_unpacklo_epi16(rl, w512);
					__m128i rlh = _mm_unpackhi_epi16(rl, w512);
					__m128i rhl = _mm_unpacklo_epi16(rh, w512);
					__m128i rhh = _mm_unpackhi_epi16(rh, w512);

					rll = _mm_madd_epi16(rll, bc);
					rlh = _mm_madd_epi16(rlh, bc);
					rhl = _mm_madd_epi16(rhl, bc);
					rhh = _mm_madd_epi16(rhh, bc);

					rll = _mm_srai_epi32(rll, 9);
					rlh = _mm_srai_epi32(rlh, 9);
					rhl = _mm_srai_epi32(rhl, 9);
					rhh = _mm_srai_epi32(rhh, 9);

					rl = _mm_packs_epi32(rll, rlh);
					rh = _mm_packs_epi32(rhl, rhh);

					r = _mm_packus_epi16(rl, rh);

					_mm_store_si128((__m128i*)p, r);

					p += 16;
				}
			}
#endif

			for (; x < Width; x++) {
				*p = m_YTable[*p];
				p++;
			}

			pData += Pitch;
		}
	}
}
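Per pixel, the SSE2 branch above computes the same adjustment the m_YTable fallback encodes: subtract the black level, scale by the contrast factor, add the brightness term, and shift back with unsigned saturation. A scalar sketch of that arithmetic (my reading of the madd/srai/packus sequence, not code from the original project):

#include <stdint.h>

/* Scalar sketch of the SSE2 path (assumption):
 * c = m_Contrast*512/100 + 512, b = m_Brightness*255/100 + 16. */
static uint8_t AdjustLuma(uint8_t y, int c, int b)
{
	int v = ((y - 16) * c + b * 512) >> 9;  /* madd with (value,512) x (c,b), then srai by 9 */
	if (v < 0) v = 0;
	if (v > 255) v = 255;                   /* packus saturation */
	return (uint8_t)v;
}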
Example #13
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1,  UINT32 src1Step,
    const BYTE* pSrc2,  UINT32 src2Step,
    BYTE* pDst,  UINT32 dstStep,
    UINT32 width,  UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*) pSrc1;
	const UINT32* sptr2 = (const UINT32*) pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 y;
	__m128i xmm0, xmm1;

	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;

	if (width < 4)     /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
					       pDst, dstStep, width, height);
	}

	dptr = (UINT32*) pDst;
	linebytes = width * sizeof(UINT32);
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
	xmm0 = _mm_set1_epi32(0);
	xmm1 = _mm_set1_epi16(1);

	for (y = 0; y < height; ++y)
	{
		int pixels = width;
		int count;
		/* Get to the 16-byte boundary now. */
		int leadIn = 0;

		switch ((ULONG_PTR) dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1,
						src1Step, (const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr  += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5 */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
			/* Must mask off remainders or pack gets confused */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			_mm_store_si128((__m128i*) dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
						(const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr  += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr  += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}
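Per channel, the SIMD loop above is the classic integer alpha blend driven by the first source's alpha: out = src2 + ((alpha + 1) * (src1 - src2)) >> 8. A scalar sketch of one channel (illustrative, not FreeRDP's generic fallback):

#include <stdint.h>

/* One channel of the blend above (assumption: a1 = alpha of the first source). */
static uint8_t alphaCompChannel(uint8_t c1, uint8_t c2, uint8_t a1)
{
	int diff = (int)c1 - (int)c2;                /* _mm_subs_epi16 step        */
	int v = (int)c2 + (((a1 + 1) * diff) >> 8);  /* add 1, mullo, srai 8, adds */
	if (v < 0) v = 0;
	if (v > 255) v = 255;                        /* mask + packus              */
	return (uint8_t)v;
}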
Example #14
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
		BYTE *pDst, int dstStep, const prim_size_t *roi)
{
	int lastRow, lastCol;
	BYTE *UData,*VData,*YData;
	int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
	__m128i r0,r1,r2,r3,r4,r5,r6,r7;
	__m128i *buffer;
	
	/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
	 * last_column: if it's the last column in a line, set to 10B (for handling line endings that are not a multiple of four) */

	buffer = _aligned_malloc(4 * 16, 16);
	
	YData = (BYTE*) pSrc[0];
	UData = (BYTE*) pSrc[1];
	VData = (BYTE*) pSrc[2];
	
	nWidth = roi->width;
	nHeight = roi->height;
	
	if ((lastCol = (nWidth & 3)))
	{
		switch (lastCol)
		{
			case 1:
				r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF);
				break;

			case 2:
				r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF);
				break;

			case 3:
				r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);
				break;
		}

		_mm_store_si128(buffer+3,r7);
		lastCol = 1;
	}
	
	nWidth += 3;
	nWidth = nWidth >> 2;
	
	lastRow = nHeight & 1;
	nHeight++;
	nHeight = nHeight >> 1;
	
	VaddDst = (dstStep << 1) - (nWidth << 4);
	VaddY = (srcStep[0] << 1) - (nWidth << 2);
	VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
	VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
	
	while (nHeight-- > 0)
	{
		if (nHeight == 0)
			lastRow <<= 1;

		i = 0;
		
		do
		{
			if (!(i & 0x01))
			{
			/* Y-, U- and V-data are stored in different arrays.
			* We start with processing U-data.
			*
			* At first we fetch four U-values from their array and shuffle them like this:
			*	0d0d 0c0c 0b0b 0a0a
			* We've done two things: converting the values to signed words and duplicating
			* each value, because two pixels always "share" the same U- (and V-) data */
				r0 = _mm_cvtsi32_si128(*(UINT32 *)UData);
				r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
				r0 = _mm_shuffle_epi8(r0,r5);
				
				UData += 4;
				
			/* then we subtract 128 from each value, so we get D */
				r3 = _mm_set_epi16(128,128,128,128,128,128,128,128);
				r0 = _mm_subs_epi16(r0,r3);
				
			/* we need to do two things with our D, so let's store it for later use */
				r2 = r0;
				
			/* now we can multiply our D with 48 and unpack it to xmm4:xmm0
			 * this is what we need to get G data later on */
				r4 = r0;
				r7 = _mm_set_epi16(48,48,48,48,48,48,48,48);
				r0 = _mm_mullo_epi16(r0,r7);
				r4 = _mm_mulhi_epi16(r4,r7);
				r7 = r0;
				r0 = _mm_unpacklo_epi16(r0,r4);
				r4 = _mm_unpackhi_epi16(r7,r4);
				
			/* to get B data, we need to prepare a second value, D*475 */
				r1 = r2;
				r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
				r1 = _mm_mullo_epi16(r1,r7);
				r2 = _mm_mulhi_epi16(r2,r7);
				r7 = r1;
				r1 = _mm_unpacklo_epi16(r1,r2);
				r7 = _mm_unpackhi_epi16(r7,r2);
				
			/* so we got something like this: xmm7:xmm1
			 * this pair contains values for 16 pixels:
			 * aabbccdd
			 * aabbccdd, but we can only work on four pixels at once, so we need to save the upper values */
				_mm_store_si128(buffer+1,r7);
				
			/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
				r2 = _mm_cvtsi32_si128(*(UINT32 *)VData);
				r2 = _mm_shuffle_epi8(r2,r5);
				
				VData += 4;
				
				r2 = _mm_subs_epi16(r2,r3);
				
				r5 = r2;
				
			/* this is also known as E*403, we need it to convert R data */
				r3 = r2;
				r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
				r2 = _mm_mullo_epi16(r2,r7);
				r3 = _mm_mulhi_epi16(r3,r7);
				r7 = r2;
				r2 = _mm_unpacklo_epi16(r2,r3);
				r7 = _mm_unpackhi_epi16(r7,r3);
				
			/* and preserve upper four values for future ... */
				_mm_store_si128(buffer+2,r7);
				
			/* doing this step: E*120 */
				r3 = r5;
				r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
				r3 = _mm_mullo_epi16(r3,r7);
				r5 = _mm_mulhi_epi16(r5,r7);
				r7 = r3;
				r3 = _mm_unpacklo_epi16(r3,r5);
				r7 = _mm_unpackhi_epi16(r7,r5);
				
			/* now we complete what we've begun above:
			 * (48*D) + (120*E) = (48*D +120*E) */
				r0 = _mm_add_epi32(r0,r3);
				r4 = _mm_add_epi32(r4,r7);
				
			/* and store to memory ! */
				_mm_store_si128(buffer,r4);
			}
			else
			{
			/* maybe you've wondered about the conditional above ?
			 * Well, we prepared UV data for eight pixels in each line, but can only process four
			 * per loop. So we need to load the upper four pixels' data from memory every second loop! */
				r1 = _mm_load_si128(buffer+1);
				r2 = _mm_load_si128(buffer+2);
				r0 = _mm_load_si128(buffer);
			}
			
			if (++i == nWidth)
				lastCol <<= 1;
			
		/* We didn't produce any output yet, so let's do so!
		 * Ok, fetch four pixels from the Y-data array and shuffle them like this:
		 * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */
			r4 = _mm_cvtsi32_si128(*(UINT32 *)YData);
			r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
			r4 = _mm_shuffle_epi8(r4,r7);
			
			r5 = r4;
			r6 = r4;
			
		/* now we can perform the "real" conversion itself and produce output! */
			r4 = _mm_add_epi32(r4,r2);
			r5 = _mm_sub_epi32(r5,r0);
			r6 = _mm_add_epi32(r6,r1);
			
		/* in the end, we only need bytes for the RGB values.
		 * So, what do we do? Right! Shifting left makes values bigger, and that's always good.
		 * Before we had dwords of data, and by shifting left and treating the result
		 * as packed words, we not only get signed words, but also divide by 256.
		 * Imagine the data is now ordered this way: ddx0 ccx0 bbx0 aax0, where x is the least
		 * significant byte, which we don't need anymore because we've done some rounding */
			r4 = _mm_slli_epi32(r4,8);
			r5 = _mm_slli_epi32(r5,8);
			r6 = _mm_slli_epi32(r6,8);
			
		/* one thing we still have to face is the clip() function ...
		 * we still have signed words, and there are those min/max instructions in SSE2 ...
		 * the max instruction always takes the bigger of the two operands and stores it in the first one,
		 * and it operates with signs!
		 * If we feed it with our values and zeros, it takes the zeros when our values are smaller than
		 * zero, and otherwise our values */
			r7 = _mm_set_epi32(0,0,0,0);
			r4 = _mm_max_epi16(r4,r7);
			r5 = _mm_max_epi16(r5,r7);
			r6 = _mm_max_epi16(r6,r7);
			
		/* the same thing just completely different can be used to limit our values to 255,
		 * but now using the min instruction and 255s */
			r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
			r4 = _mm_min_epi16(r4,r7);
			r5 = _mm_min_epi16(r5,r7);
			r6 = _mm_min_epi16(r6,r7);
			
		/* Now we have our bytes.
		 * The moment has come to assemble the three channels R, G and B into the xrgb dwords.
		 * On the Red channel we just have to AND each future dword with 00FF0000H */
			//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
			r4 = _mm_and_si128(r4,r7);
			
		/* on Green channel we have to shuffle somehow, so we get something like this:
		 * 00d0 00c0 00b0 00a0 */
			r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
			r5 = _mm_shuffle_epi8(r5,r7);
			
		/* and on Blue channel that one:
		 * 000d 000c 000b 000a */
			r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
			r6 = _mm_shuffle_epi8(r6,r7);
			
		/* and at last we or it together and get this one:
		 * xrgb xrgb xrgb xrgb */
			r4 = _mm_or_si128(r4,r5);
			r4 = _mm_or_si128(r4,r6);
			
		/* The only thing to do now is write the data to memory, but this gets a bit more
		 * complicated if the width is not a multiple of four and it is the last column in the line. */
			if (lastCol & 0x02)
			{
			/* let's say we only need to convert six pixels in width.
			 * Ok, the first 4 pixels will be converted just like any other 4 pixels, but
			 * if it's the last loop in the line, last_column is shifted left by one (curious? have a look above),
			 * and we land here. During initialisation a mask was prepared. In this case it looks like
			 * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */
				r6 = _mm_load_si128(buffer+3);
			/* we AND our output data with this mask to get only the valid pixels */
				r4 = _mm_and_si128(r4,r6);
			/* then we fetch memory from the destination array ... */
				r5 = _mm_lddqu_si128((__m128i *)pDst);
			/* ... and AND it with the inverse mask. We get only those pixels which should not be updated */
				r6 = _mm_andnot_si128(r6,r5);
			/* we only have to OR the two values together and write them back to the destination array,
			 * and only the pixels that should be updated really get changed. */
				r4 = _mm_or_si128(r4,r6);
			}
			_mm_storeu_si128((__m128i *)pDst,r4);
			
			if (!(lastRow & 0x02))
			{
			/* Because UV data is the same for two lines, we can process the second line right here,
			 * in the same loop. The only thing we need to do is add some offsets to the Y- and destination
			 * pointers. These offsets are iStride[0] and the target scanline.
			 * But if we don't need to process the second line, e.g. when the image has an odd number of lines and we are in the last one,
			 * we just skip all this. */
				r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
				r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
				r4 = _mm_shuffle_epi8(r4,r7);
				
				r5 = r4;
				r6 = r4;
				
				r4 = _mm_add_epi32(r4,r2);
				r5 = _mm_sub_epi32(r5,r0);
				r6 = _mm_add_epi32(r6,r1);
				
				r4 = _mm_slli_epi32(r4,8);
				r5 = _mm_slli_epi32(r5,8);
				r6 = _mm_slli_epi32(r6,8);
				
				r7 = _mm_set_epi32(0,0,0,0);
				r4 = _mm_max_epi16(r4,r7);
				r5 = _mm_max_epi16(r5,r7);
				r6 = _mm_max_epi16(r6,r7);
				
				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
				r4 = _mm_min_epi16(r4,r7);
				r5 = _mm_min_epi16(r5,r7);
				r6 = _mm_min_epi16(r6,r7);
				
				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
				r4 = _mm_and_si128(r4,r7);
				
				r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
				r5 = _mm_shuffle_epi8(r5,r7);
				
				r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
				r6 = _mm_shuffle_epi8(r6,r7);
				
				r4 = _mm_or_si128(r4,r5);
				r4 = _mm_or_si128(r4,r6);
				
				if (lastCol & 0x02)
				{
					r6 = _mm_load_si128(buffer+3);
					r4 = _mm_and_si128(r4,r6);
					r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
					r6 = _mm_andnot_si128(r6,r5);
					r4 = _mm_or_si128(r4,r6);
					
				/* only thing is, we should shift lastCol back here, because we have processed the last column,
				 * and this "special condition" can be released */
					lastCol >>= 1;
				}
				_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
			}
			
		/* after all this we have to advance the destination and Y-data pointers by four pixels */
			pDst += 16;
			YData += 4;
		}
Example #15
void SGMStereo::calcRowCosts(unsigned char*& leftSobelRow, int*& leftCensusRow,
							 unsigned char*& rightSobelRow, int*& rightCensusRow,
							 unsigned short* costImageRow)
{
	const int widthStepCost = width_*disparityTotal_;
	const __m128i registerZero = _mm_setzero_si128();

	for (int y = 1; y < height_; ++y) {
		int addRowIndex = y + aggregationWindowRadius_;
		int addRowAggregatedCostIndex = std::min(addRowIndex, height_ - 1)%(aggregationWindowRadius_*2 + 2);
		unsigned short* addRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*addRowAggregatedCostIndex;

		if (addRowIndex < height_) {
			calcPixelwiseSAD(leftSobelRow, rightSobelRow);
			addPixelwiseHamming(leftCensusRow, rightCensusRow);

			memset(addRowAggregatedCost, 0, disparityTotal_*sizeof(unsigned short));
			// x = 0
			for (int x = 0; x <= aggregationWindowRadius_; ++x) {
				int scale = x == 0 ? aggregationWindowRadius_ + 1 : 1;
				for (int d = 0; d < disparityTotal_; ++d) {
					addRowAggregatedCost[d] += static_cast<unsigned short>(pixelwiseCostRow_[disparityTotal_*x + d]*scale);
				}
			}
			// x = 1...width-1
			int subRowAggregatedCostIndex = std::max(y - aggregationWindowRadius_ - 1, 0)%(aggregationWindowRadius_*2 + 2);
			const unsigned short* subRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*subRowAggregatedCostIndex;
			const unsigned short* previousCostRow = costImageRow - widthStepCost;
			for (int x = 1; x < width_; ++x) {
				const unsigned char* addPixelwiseCost = pixelwiseCostRow_
					+ std::min((x + aggregationWindowRadius_)*disparityTotal_, (width_ - 1)*disparityTotal_);
				const unsigned char* subPixelwiseCost = pixelwiseCostRow_
					+ std::max((x - aggregationWindowRadius_ - 1)*disparityTotal_, 0);

				for (int d = 0; d < disparityTotal_; d += 16) {
					__m128i registerAddPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(addPixelwiseCost + d));
					__m128i registerAddPixelwiseHigh = _mm_unpackhi_epi8(registerAddPixelwiseLow, registerZero);
					registerAddPixelwiseLow = _mm_unpacklo_epi8(registerAddPixelwiseLow, registerZero);
					__m128i registerSubPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(subPixelwiseCost + d));
					__m128i registerSubPixelwiseHigh = _mm_unpackhi_epi8(registerSubPixelwiseLow, registerZero);
					registerSubPixelwiseLow = _mm_unpacklo_epi8(registerSubPixelwiseLow, registerZero);

					// Low
					__m128i registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost
						+ disparityTotal_*(x - 1) + d));
					registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseLow),
														   registerAddPixelwiseLow);
					__m128i registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d));
					registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost,
						_mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + disparityTotal_*x + d))),
						registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d), registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d), registerCost);

					// High
					registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost + disparityTotal_*(x-1) + d + 8));
					registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseHigh),
														   registerAddPixelwiseHigh);
					registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d + 8));
					registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost,
						_mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + disparityTotal_*x + d + 8))),
						registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d + 8), registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d + 8), registerCost);
				}
			}
		}

		leftSobelRow += widthStep_;
		rightSobelRow += widthStep_;
		leftCensusRow += width_;
		rightCensusRow += width_;
		costImageRow += widthStepCost;
	}
}
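The inner loop above is an incremental (sliding-window) update: rather than re-summing the aggregation window for every pixel, each new aggregated cost is derived from its left neighbour, and each output cost from the row above. In scalar form it reads as follows (my reading of the adds/subs pairs, illustrative only; the SIMD code uses saturating arithmetic, but the sums are sized not to overflow):

/* Scalar restatement of the incremental aggregation above (assumption). */
static void aggregateOneBlock(unsigned short* rowAggCur, const unsigned short* rowAggLeft,
                              const unsigned char* addCost, const unsigned char* subCost,
                              unsigned short* costCur, const unsigned short* costAbove,
                              const unsigned short* subRowAgg, int disparityTotal)
{
	for (int d = 0; d < disparityTotal; ++d) {
		/* horizontal box filter: slide the window one column to the right */
		rowAggCur[d] = (unsigned short)(rowAggLeft[d] + addCost[d] - subCost[d]);
		/* vertical accumulation: reuse the cost from the row above, swapping one aggregated row */
		costCur[d] = (unsigned short)(costAbove[d] + rowAggCur[d] - subRowAgg[d]);
	}
}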
Example #16
int
smith_waterman_sse2_word(const unsigned char *     query_sequence,
                         unsigned short *    query_profile_word,
                         const int                 query_length,
                         const unsigned char *     db_sequence,
                         const int                 db_length,
                         unsigned short      gap_open,
                         unsigned short      gap_extend,
                         struct f_struct *   f_str)
{
    int     i, j, k;
    short   score;

    int     cmp;
    int     iter = (query_length + 7) / 8;
    

    __m128i *p;
    __m128i *workspace = (__m128i *) f_str->workspace;

    __m128i E, F, H;

    __m128i v_maxscore;
    __m128i v_gapopen;
    __m128i v_gapextend;

    __m128i v_min;
    __m128i v_minimums;
    __m128i v_temp;

    __m128i *pHLoad, *pHStore;
    __m128i *pE;

    __m128i *pScore;

    /* Load gap opening penalty to all elements of a constant */
    v_gapopen = _mm_setzero_si128();	/* Apple Devel */
    v_gapopen = _mm_insert_epi16 (v_gapopen, gap_open, 0);
    v_gapopen = _mm_shufflelo_epi16 (v_gapopen, 0);
    v_gapopen = _mm_shuffle_epi32 (v_gapopen, 0);

    /* Load gap extension penalty to all elements of a constant */
    v_gapextend = _mm_setzero_si128();	/* Apple Devel */
    v_gapextend = _mm_insert_epi16 (v_gapextend, gap_extend, 0);
    v_gapextend = _mm_shufflelo_epi16 (v_gapextend, 0);
    v_gapextend = _mm_shuffle_epi32 (v_gapextend, 0);

    /* load v_maxscore with zeros.  since we are using signed */
    /*  math, we will bias the maxscore to -32768 so we have the */
    /*  full range of the short. */
    v_maxscore = _mm_setzero_si128();	/* Apple Devel */
    v_maxscore = _mm_cmpeq_epi16 (v_maxscore, v_maxscore);
    v_maxscore = _mm_slli_epi16 (v_maxscore, 15);

    v_minimums = _mm_shuffle_epi32 (v_maxscore, 0);

    v_min = _mm_shuffle_epi32 (v_maxscore, 0);
    v_min = _mm_srli_si128 (v_min, 14);

    /* Zero out the storage vector */
    k = 2 * iter;

    p = workspace;
    for (i = 0; i < k; i++)
    {
        _mm_store_si128 (p++, v_maxscore);
    }

    pE = workspace;
    pHStore = pE + iter;
    pHLoad = pHStore + iter;

    for (i = 0; i < db_length; ++i)
    {
        /* fetch first data asap. */
        pScore = (__m128i *) query_profile_word + db_sequence[i] * iter;

        /* bias all elements in F to -32768 */
        F = _mm_setzero_si128();	/* Apple Devel */
        F = _mm_cmpeq_epi16 (F, F);
        F = _mm_slli_epi16 (F, 15);

        /* load the next h value */
        H = _mm_load_si128 (pHStore + iter - 1);
        H = _mm_slli_si128 (H, 2);
        H = _mm_or_si128 (H, v_min);

        p = pHLoad;
        pHLoad = pHStore;
        pHStore = p;

        for (j = 0; j < iter; j++)
        {
            /* load E values */
            E = _mm_load_si128 (pE + j);

            /* add score to H */
            H = _mm_adds_epi16 (H, *pScore++);

            /* Update highest score encountered this far */
            v_maxscore = _mm_max_epi16 (v_maxscore, H);

            /* get max from H, E and F */
            H = _mm_max_epi16 (H, E);
            H = _mm_max_epi16 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* subtract the gap open penalty from H */
            H = _mm_subs_epi16 (H, v_gapopen);

            /* update E value */
            E = _mm_subs_epi16 (E, v_gapextend);
            E = _mm_max_epi16 (E, H);

            /* update F value */
            F = _mm_subs_epi16 (F, v_gapextend);
            F = _mm_max_epi16 (F, H);

            /* save E values */
            _mm_store_si128 (pE + j, E);

            /* load the next h value */
            H = _mm_load_si128 (pHLoad + j);
        }

        /* reset pointers to the start of the saved data */
        j = 0;
        H = _mm_load_si128 (pHStore + j);

        /*  the computed F value is for the given column.  since */
        /*  we are at the end, we need to shift the F value over */
        /*  to the next column. */
        F = _mm_slli_si128 (F, 2);
        F = _mm_or_si128 (F, v_min);
        v_temp = _mm_subs_epi16 (H, v_gapopen);
        v_temp = _mm_cmpgt_epi16 (F, v_temp);
        cmp  = _mm_movemask_epi8 (v_temp);

        while (cmp != 0x0000) 
        {
            E = _mm_load_si128 (pE + j);

            H = _mm_max_epi16 (H, F);

            /* save H values */
            _mm_store_si128 (pHStore + j, H);

            /* update E in case the new H value would change it */
            H = _mm_subs_epi16 (H, v_gapopen);
            E = _mm_max_epi16 (E, H);
            _mm_store_si128 (pE + j, E);

            /* update F value */
            F = _mm_subs_epi16 (F, v_gapextend);

            j++;
            if (j >= iter)
            {
                j = 0;
                F = _mm_slli_si128 (F, 2);
                F = _mm_or_si128 (F, v_min);
            }
            H = _mm_load_si128 (pHStore + j);

            v_temp = _mm_subs_epi16 (H, v_gapopen);
            v_temp = _mm_cmpgt_epi16 (F, v_temp);
            cmp  = _mm_movemask_epi8 (v_temp);
        }
    }

    /* find largest score in the v_maxscore vector */
    v_temp = _mm_srli_si128 (v_maxscore, 8);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 4);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);
    v_temp = _mm_srli_si128 (v_maxscore, 2);
    v_maxscore = _mm_max_epi16 (v_maxscore, v_temp);

    /* extract the largest score */
    score = _mm_extract_epi16 (v_maxscore, 0);

    /* return largest score biased by 32768 */

    /* fix for Mac OSX clang 4.1 */ 
    /*
#ifdef __clang__
    if (score < 0) score += 32768;
    return score;
#else
    */
    return score + 32768;
    /* #endif */
}
Example #17
void SGMStereo::performSGM(unsigned short* costImage, unsigned short* disparityImage) {
	const short costMax = SHRT_MAX;

	int widthStepCostImage = width_*disparityTotal_;

	short* costSums = sgmBuffer_;
	memset(costSums, 0, costSumBufferSize_*sizeof(short));

	short** pathCosts = new short*[pathRowBufferTotal_];
	short** pathMinCosts = new short*[pathRowBufferTotal_];

	const int processPassTotal = 2;
	for (int processPassCount = 0; processPassCount < processPassTotal; ++processPassCount) {
		int startX, endX, stepX;
		int startY, endY, stepY;
		if (processPassCount == 0) {
			startX = 0; endX = width_; stepX = 1;
			startY = 0; endY = height_; stepY = 1;
		} else {
			startX = width_ - 1; endX = -1; stepX = -1;
			startY = height_ - 1; endY = -1; stepY = -1;
		}

		for (int i = 0; i < pathRowBufferTotal_; ++i) {
			pathCosts[i] = costSums + costSumBufferSize_ + pathCostBufferSize_*i + pathDisparitySize_ + 8;
			memset(pathCosts[i] - pathDisparitySize_ - 8, 0, pathCostBufferSize_*sizeof(short));
			pathMinCosts[i] = costSums + costSumBufferSize_ + pathCostBufferSize_*pathRowBufferTotal_
				+ pathMinCostBufferSize_*i + pathTotal_*2;
			memset(pathMinCosts[i] - pathTotal_, 0, pathMinCostBufferSize_*sizeof(short));
		}

		for (int y = startY; y != endY; y += stepY) {
			unsigned short* pixelCostRow = costImage + widthStepCostImage*y;
			short* costSumRow = costSums + costSumBufferRowSize_*y;

			memset(pathCosts[0] - pathDisparitySize_ - 8, 0, pathDisparitySize_*sizeof(short));
			memset(pathCosts[0] + width_*pathDisparitySize_ - 8, 0, pathDisparitySize_*sizeof(short));
			memset(pathMinCosts[0] - pathTotal_, 0, pathTotal_*sizeof(short));
			memset(pathMinCosts[0] + width_*pathTotal_, 0, pathTotal_*sizeof(short));

			for (int x = startX; x != endX; x += stepX) {
				int pathMinX = x*pathTotal_;
				int pathX = pathMinX*disparitySize_;

				int previousPathMin0 = pathMinCosts[0][pathMinX - stepX*pathTotal_] + smoothnessPenaltyLarge_;
				int previousPathMin2 = pathMinCosts[1][pathMinX + 2] + smoothnessPenaltyLarge_;

				short* previousPathCosts0 = pathCosts[0] + pathX - stepX*pathDisparitySize_;
				short* previousPathCosts2 = pathCosts[1] + pathX + disparitySize_*2;

				previousPathCosts0[-1] = previousPathCosts0[disparityTotal_] = costMax;
				previousPathCosts2[-1] = previousPathCosts2[disparityTotal_] = costMax;

				short* pathCostCurrent = pathCosts[0] + pathX;
				const unsigned short* pixelCostCurrent = pixelCostRow + disparityTotal_*x;
				short* costSumCurrent = costSumRow + disparityTotal_*x;

				__m128i regPenaltySmall = _mm_set1_epi16(static_cast<short>(smoothnessPenaltySmall_));

				__m128i regPathMin0, regPathMin2;
				regPathMin0 = _mm_set1_epi16(static_cast<short>(previousPathMin0));
				regPathMin2 = _mm_set1_epi16(static_cast<short>(previousPathMin2));
				__m128i regNewPathMin = _mm_set1_epi16(costMax);

				for (int d = 0; d < disparityTotal_; d += 8) {
					__m128i regPixelCost = _mm_load_si128(reinterpret_cast<const __m128i*>(pixelCostCurrent + d));

					__m128i regPathCost0, regPathCost2;
					regPathCost0 = _mm_load_si128(reinterpret_cast<const __m128i*>(previousPathCosts0 + d));
					regPathCost2 = _mm_load_si128(reinterpret_cast<const __m128i*>(previousPathCosts2 + d));

					regPathCost0 = _mm_min_epi16(regPathCost0,
												 _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts0 + d - 1)),
												 regPenaltySmall));
					regPathCost0 = _mm_min_epi16(regPathCost0,
												 _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts0 + d + 1)),
												 regPenaltySmall));
					regPathCost2 = _mm_min_epi16(regPathCost2,
												 _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts2 + d - 1)),
												 regPenaltySmall));
					regPathCost2 = _mm_min_epi16(regPathCost2,
												 _mm_adds_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(previousPathCosts2 + d + 1)),
												 regPenaltySmall));

					regPathCost0 = _mm_min_epi16(regPathCost0, regPathMin0);
					regPathCost0 = _mm_adds_epi16(_mm_subs_epi16(regPathCost0, regPathMin0), regPixelCost);
					regPathCost2 = _mm_min_epi16(regPathCost2, regPathMin2);
					regPathCost2 = _mm_adds_epi16(_mm_subs_epi16(regPathCost2, regPathMin2), regPixelCost);

					_mm_store_si128(reinterpret_cast<__m128i*>(pathCostCurrent + d), regPathCost0);
					_mm_store_si128(reinterpret_cast<__m128i*>(pathCostCurrent + d + disparitySize_*2), regPathCost2);

					__m128i regMin02 = _mm_min_epi16(_mm_unpacklo_epi16(regPathCost0, regPathCost2),
													 _mm_unpackhi_epi16(regPathCost0, regPathCost2));
					regMin02 = _mm_min_epi16(_mm_unpacklo_epi16(regMin02, regMin02),
											 _mm_unpackhi_epi16(regMin02, regMin02));
					regNewPathMin = _mm_min_epi16(regNewPathMin, regMin02);

					__m128i regCostSum = _mm_load_si128(reinterpret_cast<const __m128i*>(costSumCurrent + d));

					regCostSum = _mm_adds_epi16(regCostSum, regPathCost0);
					regCostSum = _mm_adds_epi16(regCostSum, regPathCost2);

					_mm_store_si128(reinterpret_cast<__m128i*>(costSumCurrent + d), regCostSum);
				}

				regNewPathMin = _mm_min_epi16(regNewPathMin, _mm_srli_si128(regNewPathMin, 8));
				_mm_storel_epi64(reinterpret_cast<__m128i*>(&pathMinCosts[0][pathMinX]), regNewPathMin);
			}

			if (processPassCount == processPassTotal - 1) {
				unsigned short* disparityRow = disparityImage + width_*y;

				for (int x = 0; x < width_; ++x) {
					short* costSumCurrent = costSumRow + disparityTotal_*x;
					int bestSumCost = costSumCurrent[0];
					int bestDisparity = 0;
					for (int d = 1; d < disparityTotal_; ++d) {
						if (costSumCurrent[d] < bestSumCost) {
							bestSumCost = costSumCurrent[d];
							bestDisparity = d;
						}
					}

					if (bestDisparity > 0 && bestDisparity < disparityTotal_ - 1) {
						int centerCostValue = costSumCurrent[bestDisparity];
						int leftCostValue = costSumCurrent[bestDisparity - 1];
						int rightCostValue = costSumCurrent[bestDisparity + 1];
						if (rightCostValue < leftCostValue) {
							bestDisparity = static_cast<int>(bestDisparity*disparityFactor_
															 + static_cast<double>(rightCostValue - leftCostValue)/(centerCostValue - leftCostValue)/2.0*disparityFactor_ + 0.5);
						} else {
							bestDisparity = static_cast<int>(bestDisparity*disparityFactor_
															 + static_cast<double>(rightCostValue - leftCostValue)/(centerCostValue - rightCostValue)/2.0*disparityFactor_ + 0.5);
						}
					} else {
						bestDisparity = static_cast<int>(bestDisparity*disparityFactor_);
					}

					disparityRow[x] = static_cast<unsigned short>(bestDisparity);
				}
			}

			std::swap(pathCosts[0], pathCosts[1]);
			std::swap(pathMinCosts[0], pathMinCosts[1]);
		}
	}
	delete[] pathCosts;
	delete[] pathMinCosts;

	speckleFilter(100, static_cast<int>(2*disparityFactor_), disparityImage);
}
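The vectorised loop in performSGM evaluates the usual semi-global matching path recurrence, eight disparities at a time and two path directions per pass. In scalar form it reads roughly as below (an illustrative restatement; note that this implementation folds the large penalty into the subtracted minimum, exactly as previousPathMin0/previousPathMin2 are built above):

/* Scalar form of the per-disparity update above (assumption).
 * prevPath      : path costs at the previous pixel along the path, indexed by disparity;
 *                 prevPath[-1] and prevPath[disparityTotal_] are primed with costMax above.
 * prevMinPlusP2 : min over all disparities of prevPath, plus smoothnessPenaltyLarge_.
 * P1            : smoothnessPenaltySmall_. */
static short sgmPathCost(const short* prevPath, short prevMinPlusP2,
                         unsigned short pixelCost, short P1, int d)
{
	short m = prevPath[d];
	if ((short)(prevPath[d - 1] + P1) < m) m = (short)(prevPath[d - 1] + P1);
	if ((short)(prevPath[d + 1] + P1) < m) m = (short)(prevPath[d + 1] + P1);
	if (prevMinPlusP2 < m)                 m = prevMinPlusP2;
	return (short)(pixelCost + m - prevMinPlusP2);
}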
Example #18
int
global_sse2_word(int                  queryLength,
                 unsigned short      *profile,
                 const unsigned char *dbSeq,
                 int                  dbLength,
                 unsigned short       gapOpen,
                 unsigned short       gapExtend,
                 unsigned short       ceiling,
                 struct f_struct     *f_str)
{
  int     i, j;

  int     score;
  int     scale;
  int     temp;
  int     distance;

  int     offset;
  int     position;

  int     cmp;
  int     iter;
    
  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 7) / 8;
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  vGapOpen = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  vGapExtend = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling */
  vTemp = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
  vCeiling = _mm_srli_epi16 (vCeiling, 1);
  vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
  vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

  vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
  vNull = _mm_slli_epi16 (vNull, 15);
  vScaleAmt = _mm_xor_si128 (vNull, vNull);

  /* Zero out the storage vector */
  vTemp = _mm_adds_epi16 (vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vTemp);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128 (vGapOpen, 14);
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_adds_epi16 (vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = vNull;

    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epi16 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
      vE = _mm_max_epi16 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16 (vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16 (vH, vE);
      vH = _mm_max_epi16 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16 (vH, vGapOpen);
      vF = _mm_max_epi16 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
    cmp  = _mm_movemask_epi8 (vTemp);

    /* broadcast F values */
    vF = _mm_xor_si128 (vF, vNull);

    vTemp  = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
    vF = max_epu16 (vF, vTemp);

    vTemp  = _mm_slli_si128 (vF, 4);
    vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128 (vF, 2);
      vScale = _mm_subs_epu16 (vScale, vGapOpen);
      vScale = _mm_subs_epu16 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16 (vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128 (vF, vNull);
      vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
      cmp  = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      vTemp   = _mm_adds_epi16 (vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16 (vScale, vTemp);
      vScale2 = _mm_subs_epu16 (vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epi16 (vH, vScale1);
        vH = _mm_subs_epi16 (vH, vScale2);
        vE = _mm_subs_epi16 (vE, vScale1);
        vE = _mm_subs_epi16 (vE, vScale2);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 2);
      }

      /* calculate the final scaling amount */
      vTemp   = _mm_xor_si128 (vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
      vScale  = _mm_add_epi32 (vScale1, vScale2);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_add_epi32 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_add_epi32 (vScale, vTemp);
      scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
      temp  = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
      scale = scale + (temp << 16);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 2);
    vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128 (vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_xor_si128 (vH, vNull);
    vH = _mm_slli_si128 (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);
  }

  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  }
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
Example #19
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

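      // flat2: 0xFF where p4..p7 and q4..q7 also lie within 1 of p0/q0 (wide flat segment).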
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

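      // Widen to 16 bits: pN_16 from the low (p-side) half, qN_16 from the high (q-side) half.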
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

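      // Running sums: pixelFilter_p/q feed the 15-tap wide filter, pixetFilter_* the 7-tap flat filter.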
      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
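      // Wide (15-tap) output for p0/q0: (p7 + p6 + ... + 2 * p0 + q0 + ... + q6 + 8) >> 4, mirrored for q0.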
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
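      // Narrow (7-tap) output for p0/q0: (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3, mirrored for q0.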
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

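      // Sliding window for the remaining taps: add another copy of p7/q7 (or p3/q3) and subtract the sample that leaves the window.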
      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

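    // 68 == _MM_SHUFFLE(1, 0, 1, 0): broadcast the folded low 64 bits of each mask to both halves.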
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

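    // Blend per pixel: flat2 selects the 15-tap result, flat the 7-tap result, otherwise keep the 4-tap/original value.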
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

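    // Store back: the low 64 bits go to the p rows above s, the high 64 bits to the q rows at/below s.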
    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}