Example #1
void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols)
{
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2,symbol_12, symbol_34;
  __m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
  __m128i result11, result12, result13, result22, result21,result23, result31, result32, result33;

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10);

  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff);

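  // Each pass of the loop below consumes 8 complex symbols (four 4-float loads)
  // and emits 8 * 6 = 48 LLR bytes, written back as three 16-byte stores.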
  for (int i=0;i<nsymbols/8;i++) {
    symbol1   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4   = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    symbol_12  = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34  = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i   = _mm_packs_epi16(symbol_12, symbol_34);

    symbol_abs  = _mm_abs_epi8(symbol_i);
    symbol_abs  = _mm_sub_epi8(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2);

    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);

    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);

    result31 = _mm_shuffle_epi8(symbol_i, shuffle_negated_3);
    result32 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_3);
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);

    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12),result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22),result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32),result33)); resultPtr++;

  }
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
    float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));

    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
  }
}
Example #2
bool WidgetAugmentedView::render()
{
    if (!stream) return false;
    stream->getColorFrame(colorFrame);
    stream->getDepthFrame(depthFrame);

    // Correct the depth map
    if (depthCorrector == nullptr) depthBuffer = depthFrame;
    else depthCorrector->correct(depthFrame, depthBuffer);

    // Setup perspective
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    gluPerspective(fovY, float(ColorFrame::WIDTH) / float(ColorFrame::HEIGHT), zNear, zFar);

    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();


    glEnable(GL_DEPTH_TEST);
    glColor4f(1.0f, 1.0f, 1.0f, 1.0f);


    //
    // Draw real world (2D color image)
    //

    glDepthFunc(GL_ALWAYS);

    glActiveTexture(GL_TEXTURE0);

    glBindTexture(GL_TEXTURE_2D, textureColor);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT,
        GL_RGBA, GL_UNSIGNED_BYTE, (GLvoid*)colorFrame.pixels);

    glActiveTexture(GL_TEXTURE1);

    glBindTexture(GL_TEXTURE_2D, textureDepth);

    KinectStream* kinect = dynamic_cast<KinectStream*>(stream.obj);
    if (kinect != nullptr) {
        kinect->mapColorFrameToDepthFrame(depthBuffer, OUT mapping);

        const NUI_DEPTH_IMAGE_POINT* src = mapping;
        GLushort* dest = textureDepthBuffer;
        GLushort* end = textureDepthBuffer + ColorFrame::SIZE;

        #define SRC(i) static_cast<short>(static_cast<unsigned short>((src + i)->depth))

        #ifndef NOT_VECTORIZED
            // Vectorized assuming ColorFrame::SIZE % 8 == 0
            __m128i min = _mm_set1_epi16(static_cast<short>(DepthFrame::MIN_DEPTH));
            __m128i max = _mm_set1_epi16(static_cast<short>(DepthFrame::MAX_DEPTH));
            __m128i _0 = _mm_setzero_si128();
            for (; dest < end; dest += 8, src += 8) {
                __m128i v = _mm_set_epi16(SRC(7), SRC(6), SRC(5), SRC(4), SRC(3), SRC(2), SRC(1), SRC(0));
                v = _mm_max_epu16(min, _mm_min_epu16(max, v));
                v = _mm_blendv_epi8(v, max, _mm_cmpeq_epi16(_0, v));
                _mm_store_si128((__m128i*)dest, v);
            }
        #else
            for (; dest < end; ++dest, ++src) {
                unsigned short s = SRC(0);
                s = (s > DepthFrame::MAX_DEPTH) ? DepthFrame::MAX_DEPTH : s;
                s = (s < DepthFrame::MIN_DEPTH) ? DepthFrame::MIN_DEPTH : s;
                *dest = static_cast<GLushort>(s);
            }
        #endif

        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT,
            GL_RED_INTEGER, GL_UNSIGNED_SHORT, (GLvoid*)textureDepthBuffer);
    }

    glActiveTexture(GL_TEXTURE0);

    shader2D.bind();

    RenderUtils::drawRect(-1.0f, 1.0f, 2.0f, -2.0f);

    shader2D.release();

    //
    // Draw augmented world
    //

    glDepthFunc(GL_LESS);

    glScalef(1.0f, 1.0f, -1.0f); // Invert Z axis so that +Z is in front of the camera

    // A plane to test occlusion
    /*glColor3f(0.0f, 1.0f, 0.0f);
    glBegin(GL_TRIANGLE_STRIP);
        glVertex3f(-0.5f, -0.5f, 0.5f);
        glVertex3f(-0.5f, 0.5f, 2.5f);
        glVertex3f(0.5f, -0.5f, 2.5f);
        glVertex3f(0.5f, 0.5f, 4.5f);
    glEnd();*/

    glEnable(GL_LIGHTING);

    // Draw the objects
    world.render(renderManager);

    glDisable(GL_LIGHTING);

    return true;
}
Example #3
int camCompareDescriptors(const int *desc1, const int *desc2, const int s)
{
    int i, j, distance = 0;
    __m128i sum, d1, d2, md, d, cmp;
    __m128i *p1 = (__m128i*)desc1, *p2 = (__m128i*)desc2;
    ALIGN(int out_sse[4], 16);

    /* Looks like a good idea... But this deteriorates performance...
    // Software prefetch
    d1 = _mm_load_si128(p1);
    d2 = _mm_load_si128(p2);
    for (i = 0; i != s; i += 32) {
	_mm_prefetch(&desc1[i], _MM_HINT_NTA);
	_mm_prefetch(&desc2[i], _MM_HINT_NTA);
    }
    */

    sum = _mm_setzero_si128();
    for (i = 0; i != s >> 4; i++) {
	// 32-bits SAD for 4 integers in parallel
	d1 = _mm_loadu_si128(p1++);
	d2 = _mm_loadu_si128(p2++);
	d = _mm_sub_epi32(d1, d2);
	md = _mm_sub_epi32(d2, d1);
	cmp = _mm_cmplt_epi32(d, _mm_setzero_si128());
	md = _mm_and_si128(cmp, md);
	d = _mm_andnot_si128(cmp, d);
	sum = _mm_add_epi32(sum, md);
	sum = _mm_add_epi32(sum, d);

	// 32-bits SAD for 4 integers in parallel
	d1 = _mm_loadu_si128(p1++);
	d2 = _mm_loadu_si128(p2++);
	d = _mm_sub_epi32(d1, d2);
	md = _mm_sub_epi32(d2, d1);
	cmp = _mm_cmplt_epi32(d, _mm_setzero_si128());
	md = _mm_and_si128(cmp, md);
	d = _mm_andnot_si128(cmp, d);
	sum = _mm_add_epi32(sum, md);
	sum = _mm_add_epi32(sum, d);

	// 32-bits SAD for 4 integers in parallel
	d1 = _mm_loadu_si128(p1++);
	d2 = _mm_loadu_si128(p2++);
	d = _mm_sub_epi32(d1, d2);
	md = _mm_sub_epi32(d2, d1);
	cmp = _mm_cmplt_epi32(d, _mm_setzero_si128());
	md = _mm_and_si128(cmp, md);
	d = _mm_andnot_si128(cmp, d);
	sum = _mm_add_epi32(sum, md);
	sum = _mm_add_epi32(sum, d);

	// 32-bits SAD for 4 integers in parallel
	d1 = _mm_loadu_si128(p1++);
	d2 = _mm_loadu_si128(p2++);
	d = _mm_sub_epi32(d1, d2);
	md = _mm_sub_epi32(d2, d1);
	cmp = _mm_cmplt_epi32(d, _mm_setzero_si128());
	md = _mm_and_si128(cmp, md);
	d = _mm_andnot_si128(cmp, d);
	sum = _mm_add_epi32(sum, md);
	sum = _mm_add_epi32(sum, d);
    }
    _mm_store_si128((__m128i*)out_sse, sum);
    return out_sse[0] + out_sse[1] + out_sse[2] + out_sse[3];
}
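For comparison, when SSSE3 is available the cmplt/and/andnot absolute-value sequence above collapses into _mm_abs_epi32. A minimal, non-unrolled sketch under that assumption (same ALIGN macro, s a multiple of 4; the name camCompareDescriptorsSSSE3 is hypothetical):
#include <tmmintrin.h>

int camCompareDescriptorsSSSE3(const int *desc1, const int *desc2, const int s)
{
    int i;
    __m128i d1, d2, sum = _mm_setzero_si128();
    const __m128i *p1 = (const __m128i*)desc1, *p2 = (const __m128i*)desc2;
    ALIGN(int out_sse[4], 16);

    for (i = 0; i != s >> 2; i++) {
        /* 32-bit absolute difference for 4 integers in parallel */
        d1 = _mm_loadu_si128(p1++);
        d2 = _mm_loadu_si128(p2++);
        sum = _mm_add_epi32(sum, _mm_abs_epi32(_mm_sub_epi32(d1, d2)));
    }
    _mm_store_si128((__m128i*)out_sse, sum);
    return out_sse[0] + out_sse[1] + out_sse[2] + out_sse[3];
}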
Example #4
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1,  UINT32 src1Step,
    const BYTE* pSrc2,  UINT32 src2Step,
    BYTE* pDst,  UINT32 dstStep,
    UINT32 width,  UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*) pSrc1;
	const UINT32* sptr2 = (const UINT32*) pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 y;
	__m128i xmm0, xmm1;

	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;

	if (width < 4)     /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
					       pDst, dstStep, width, height);
	}

	dptr = (UINT32*) pDst;
	linebytes = width * sizeof(UINT32);
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
	xmm0 = _mm_set1_epi32(0);
	xmm1 = _mm_set1_epi16(1);

	for (y = 0; y < height; ++y)
	{
		int pixels = width;
		int count;
		/* Get to the 16-byte boundary now. */
		int leadIn = 0;

		switch ((ULONG_PTR) dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1,
						src1Step, (const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr  += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5 */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
			/* Must mask off remainders or pack gets confused */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			_mm_store_si128((__m128i*) dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
						(const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr  += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr  += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}
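In scalar terms, the vector loop above blends each channel as out = (((alpha + 1) * (c1 - c2)) >> 8) + c2, with alpha taken from the first source pixel. A rough per-pixel sketch of that formula (an illustration only, not a bit-exact replica of the SSE path; it assumes the alpha channel sits in the most significant byte of each UINT32, as the _argb name suggests):
static UINT32 alphaComp_argb_pixel(UINT32 src1, UINT32 src2)
{
	INT32 alpha = (INT32)((src1 >> 24) & 0xff);	/* assumed alpha position */
	UINT32 out = 0;
	int shift;

	for (shift = 0; shift < 32; shift += 8)
	{
		INT32 c1 = (INT32)((src1 >> shift) & 0xff);
		INT32 c2 = (INT32)((src2 >> shift) & 0xff);
		INT32 c = (((alpha + 1) * (c1 - c2)) >> 8) + c2;

		if (c < 0) c = 0;	/* the SSE path clamps via _mm_packus_epi16 */
		if (c > 255) c = 255;

		out |= ((UINT32)c) << shift;
	}

	return out;
}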
Example #5
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16 bit (short) taps to 8 bit (byte) and keep the
  // same data in both halves of the 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive loads in the same 256-bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);


  for (i = output_height; i > 1; i-=2) {
     // load the last 2 loads of 16 bytes and have every two
     // consecutive loads in the same 256 bit register
     srcReg32b8 = _mm256_castsi128_si256(
     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
     _mm256_castsi256_si128(srcReg32b8), 1);
     srcReg32b9 = _mm256_castsi128_si256(
     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
     _mm256_castsi256_si128(srcReg32b9), 1);

     // merge every two consecutive registers
     // save
     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

     // add and saturate the results together
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));

     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

     // shift each 16-bit value right by 7 bits
     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

     // shrink each 16-bit value to 8 bits; the first lane contains the first
     // convolve result and the second lane contains the second convolve
     // result
     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

     src_ptr+=src_stride;

     // save 16 bytes
     _mm_store_si128((__m128i*)output_ptr,
     _mm256_castsi256_si128(srcReg32b1));

     // save the next 16 bytes
     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
     _mm256_extractf128_si256(srcReg32b1, 1));

     output_ptr+=dst_stride;

     // save part of the registers for next strides
     srcReg32b10 = srcReg32b11;
     srcReg32b1 = srcReg32b3;
     srcReg32b11 = srcReg32b2;
     srcReg32b3 = srcReg32b5;
     srcReg32b2 = srcReg32b4;
     srcReg32b5 = srcReg32b7;
     srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);


    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));


    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  }
}
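Each output byte produced above is an 8-tap vertical convolution rounded by 64, shifted right by 7, and clamped to [0, 255]. A scalar sketch of that per-pixel computation (an illustration under those assumptions, not part of libvpx):
static uint8_t filter8_vert_scalar(const uint8_t *src, ptrdiff_t src_pitch,
                                   const int16_t *filter) {
  int k;
  int sum = 64;  // rounding term, the scalar counterpart of addFilterReg64
  for (k = 0; k < 8; ++k)
    sum += (int)src[k * src_pitch] * (int)filter[k];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}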
Example #6
mlib_status
mlib_VideoColorBGR2JFIFYCC444_S16_aligned(
	mlib_s16 *y,
	mlib_s16 *cb,
	mlib_s16 *cr,
	const mlib_s16 *bgr,
	mlib_s32 n)
{
	/* 0.299*32768 */
	const __m128i x_c11 = _mm_set1_epi16(9798);

	/* 0.587*32768 */
	const __m128i x_c12 = _mm_set1_epi16(19235);

	/* 0.114*32768 */
	const __m128i x_c13 = _mm_set1_epi16(3735);

	/* -0.16874*32768 */
	const __m128i x_c21 = _mm_set1_epi16(-5529);

	/* -0.33126*32768 */
	const __m128i x_c22 = _mm_set1_epi16(-10855);

	/* 0.5*32768 */
	const __m128i x_c23 = _mm_set1_epi16(16384);

	/* 0.5*32768 */
	const __m128i x_c31 = x_c23;

	/* -0.41869*32768 */
	const __m128i x_c32 = _mm_set1_epi16(-13720);

	/* -0.08131*32768 */
	const __m128i x_c33 = _mm_set1_epi16(-2664);

	/* 2048 */
	const __m128i x_coff = _mm_set1_epi16(2048 << 2);

	const __m128i x_zero = _mm_setzero_si128();

	__m128i x_bgr0, x_bgr1, x_bgr2, x_r, x_g, x_b;
	__m128i x_y, x_cb, x_cr;
	__m128i x_t0, x_t1, x_t2, x_t3, x_t4, x_t5;
	__m128i *px_y, *px_cb, *px_cr, *px_bgr;
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	px_bgr = (__m128i *)bgr;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (i = 0; i <= (n - 8); i += 8) {
		x_bgr0 = _mm_load_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_load_si128(px_bgr++);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		x_bgr2 = _mm_load_si128(px_bgr++);
		x_bgr2 = _mm_slli_epi16(x_bgr2, 3);
		SeparateBGR48_S16;

		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);
		_mm_store_si128(px_y++, x_y);
		_mm_store_si128(px_cb++, x_cb);
		_mm_store_si128(px_cr++, x_cr);
	}

	if (i <= (n - 4)) {
		x_bgr0 = _mm_load_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_loadl_epi64(px_bgr);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		px_bgr = (__m128i *)((__m64 *)px_bgr + 1);
		SeparateBGR24_S16;

		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);
		_mm_storel_epi64(px_y, x_y);
		px_y = (__m128i *)((__m64 *)px_y + 1);
		_mm_storel_epi64(px_cb, x_cb);
		px_cb = (__m128i *)((__m64 *)px_cb + 1);
		_mm_storel_epi64(px_cr, x_cr);
		px_cr = (__m128i *)((__m64 *)px_cr + 1);

		i += 4;
	}

	for (; i <= (n - 1); i++) {
		fb = bgr[3 * i];
		fg = bgr[3 * i + 1];
		fr = bgr[3 * i + 2];

		fy = 0.29900f * fr + 0.58700f * fg + 0.11400f * fb;
		fcb = -0.16874f * fr - 0.33126f * fg + 0.50000f * fb + 2048;
		fcr = 0.50000f * fr - 0.41869f * fg - 0.08131f * fb + 2048;

		y[i] = (mlib_s16)fy;
		cb[i] = (mlib_s16)fcb;
		cr[i] = (mlib_s16)fcr;
	}

	return (MLIB_SUCCESS);
}
Example #7
QT_BEGIN_NAMESPACE

// Convert a scanline of RGB888 (src) to RGB32 (dst)
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len)
{
    quint32 *const end = dst + len;

    // Prologue, align dst to 16 bytes. The alignment is done on dst because it has 4 store()
    // for each 3 load() of src.
    const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
    const int prologLength = qMin(len, offsetToAlignOn16Bytes);

    for (int i = 0; i < prologLength; ++i) {
        *dst++ = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // Mask the first 4 colors of the RGB888 vector
    const __m128i shuffleMask = _mm_set_epi8(char(0xff), 9, 10, 11, char(0xff), 6, 7, 8, char(0xff), 3, 4, 5, char(0xff), 0, 1, 2);

    // Mask the last 4 colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB)
    const __m128i shuffleMaskEnd = _mm_set_epi8(char(0xff), 13, 14, 15, char(0xff), 10, 11, 12, char(0xff), 7, 8, 9, char(0xff), 4, 5, 6);

    // Mask to have alpha = 0xff
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);

    __m128i *inVectorPtr = (__m128i *)src;
    __m128i *dstVectorPtr = (__m128i *)dst;

    const int simdRoundCount = (len - prologLength) / 16; // one iteration in the loop converts 16 pixels
    for (int i = 0; i < simdRoundCount; ++i) {
        /*
         RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
         to load vectors of RGB888 and use palignr to select a vector out of two vectors.

         After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last
         vector of RGB888, we can mask it directly to get a last store of RGB32. After that,
         the first next byte is a R, and we can loop for the next 16 pixels.

         The conversion itself is done with a byte permutation (pshufb).
         */
        __m128i firstSrcVector = _mm_lddqu_si128(inVectorPtr);
        __m128i outputVector = _mm_shuffle_epi8(firstSrcVector, shuffleMask);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are 4 unused bytes left in firstSrcVector; we need to load the next 16 bytes
        // and build the next input with palignr
        __m128i secondSrcVector = _mm_lddqu_si128(inVectorPtr);
        __m128i srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 12);
        outputVector = _mm_shuffle_epi8(srcVector, shuffleMask);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++inVectorPtr;
        ++dstVectorPtr;
        firstSrcVector = secondSrcVector;

        // We now have 8 unused bytes left in firstSrcVector
        secondSrcVector = _mm_lddqu_si128(inVectorPtr);
        srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 8);
        outputVector = _mm_shuffle_epi8(srcVector, shuffleMask);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are now 12 unused bytes in firstSrcVector.
        // We can mask them directly, almost there.
        outputVector = _mm_shuffle_epi8(secondSrcVector, shuffleMaskEnd);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++dstVectorPtr;
    }
    src = (uchar *)inVectorPtr;
    dst = (quint32 *)dstVectorPtr;

    while (dst != end) {
        *dst++ = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
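A hypothetical call site (the helper and buffer names are assumptions), converting one scanline of packed 24-bit RGB into 32-bit 0xffRRGGBB pixels:
static QVector<quint32> convertScanline(const uchar *rgb888Line, int width)
{
    // rgb888Line must point at width * 3 bytes of packed RGB888 data.
    QVector<quint32> rgb32Line(width);
    qt_convert_rgb888_to_rgb32_ssse3(rgb32Line.data(), rgb888Line, width);
    return rgb32Line;
}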
Example #8
void test (__m128i *p, __m128i a)
{
  _mm_store_si128 (p, a);
}
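The one constraint worth noting here is that _mm_store_si128 requires a 16-byte aligned destination; _mm_storeu_si128 is the unaligned counterpart. A small sketch (the buffer name is an assumption):
#include <emmintrin.h>

void test_aligned_store (void)
{
  /* a static __m128i is naturally 16-byte aligned, so the store is well-defined */
  static __m128i buf;
  _mm_store_si128 (&buf, _mm_set1_epi32 (42));
}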
Example #9
static  int blake64_compress( state * state, const u8 * datablock ) {

  __m128i row1a,row1b;
  __m128i row2a,row2b;
  __m128i row3a,row3b;
  __m128i row4a,row4b;
  __m128i buf1a,buf2a;
  static const u8 rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9};
  __m128i r16 = _mm_load_si128((__m128i*)rot16);


  u64 m[16]; 
  u64 y[16]; 

  /* constants and permutation */
  static const int sig[][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 }, 
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  };

  static const u64 z[16] = {
    0x243F6A8885A308D3ULL,0x13198A2E03707344ULL,
    0xA4093822299F31D0ULL,0x082EFA98EC4E6C89ULL,
    0x452821E638D01377ULL,0xBE5466CF34E90C6CULL,
    0xC0AC29B7C97C50DDULL,0x3F84D5B5B5470917ULL,
    0x9216D5D98979FB1BULL,0xD1310BA698DFB5ACULL,
    0x2FFD72DBD01ADFB7ULL,0xB8E1AFED6A267E96ULL,
    0xBA7C9045F12C7F99ULL,0x24A19947B3916CF7ULL,
    0x0801F2E2858EFC16ULL,0x636920D871574E69ULL
  };

  /* get message */
  m[ 0] = U8TO64(datablock +  0);
  m[ 1] = U8TO64(datablock +  8);
  m[ 2] = U8TO64(datablock + 16);
  m[ 3] = U8TO64(datablock + 24);
  m[ 4] = U8TO64(datablock + 32);
  m[ 5] = U8TO64(datablock + 40);
  m[ 6] = U8TO64(datablock + 48);
  m[ 7] = U8TO64(datablock + 56);
  m[ 8] = U8TO64(datablock + 64);
  m[ 9] = U8TO64(datablock + 72);
  m[10] = U8TO64(datablock + 80);
  m[11] = U8TO64(datablock + 88);
  m[12] = U8TO64(datablock + 96);
  m[13] = U8TO64(datablock +104);
  m[14] = U8TO64(datablock +112);
  m[15] = U8TO64(datablock +120);

  row1b = _mm_set_epi64((__m64)state->h[3],(__m64)state->h[2]);
  row1a = _mm_set_epi64((__m64)state->h[1],(__m64)state->h[0]);
  row2b = _mm_set_epi64((__m64)state->h[7],(__m64)state->h[6]);
  row2a = _mm_set_epi64((__m64)state->h[5],(__m64)state->h[4]);
  row3b = _mm_set_epi64((__m64)0x082EFA98EC4E6C89ULL,
			(__m64)0xA4093822299F31D0ULL);
  row3a = _mm_set_epi64((__m64)0x13198A2E03707344ULL,
			(__m64)0x243F6A8885A308D3ULL);
  
  if (state->nullt) {
    row4b = _mm_set_epi64((__m64)0x3F84D5B5B5470917ULL,
			  (__m64)0xC0AC29B7C97C50DDULL);
    row4a = _mm_set_epi64((__m64)0xBE5466CF34E90C6CULL,
			  (__m64)0x452821E638D01377ULL);
  }
  else {
    row4b = _mm_set_epi64((__m64)(0x3F84D5B5B5470917ULL^state->t[1]),
			  (__m64)(0xC0AC29B7C97C50DDULL^state->t[1]));
    row4a = _mm_set_epi64((__m64)(0xBE5466CF34E90C6CULL^state->t[0]),
			  (__m64)(0x452821E638D01377ULL^state->t[0]));
  }
  /* initialization ok (beware of bug on Celeron and P4!) */

  

#define round(r)\
    /* column step */\
    /***************************************************/\
    /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13  */		\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 2]], (__m64)m[sig[r][ 0]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 3]], (__m64)z[sig[r][ 1]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );		\
    row4a = _mm_xor_si128( row4a, row1a );				\
    row4a = _mm_shuffle_epi32(row4a, 0xB1); \
    row3a = _mm_add_epi64( row3a, row4a );				\
    row2a = _mm_xor_si128( row2a, row3a );				\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
  									\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 3]], (__m64)m[sig[r][ 1]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 2]], (__m64)z[sig[r][ 0]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );		\
    row4a = _mm_xor_si128( row4a, row1a );				\
    row4a = _mm_shuffle_epi8(row4a, r16); \
    row3a = _mm_add_epi64( row3a, row4a );				\
    row2a = _mm_xor_si128( row2a, row3a );				\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
  									\
    /* same stuff for low-order side */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 6]], (__m64)m[sig[r][ 4]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 7]], (__m64)z[sig[r][ 5]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );				\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );		\
    row4b = _mm_xor_si128( row4b, row1b );				\
    row4b = _mm_shuffle_epi32(row4b, 0xB1); \
    row3b = _mm_add_epi64( row3b, row4b );				\
    row2b = _mm_xor_si128( row2b, row3b );				\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 7]], (__m64)m[sig[r][ 5]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 6]], (__m64)z[sig[r][ 4]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );		\
    row4b = _mm_xor_si128( row4b, row1b );				\
    row4b = _mm_shuffle_epi8(row4b, r16); \
    row3b = _mm_add_epi64( row3b, row4b );				\
    row2b = _mm_xor_si128( row2b, row3b );				\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
\
    /* shuffle */\
    _mm_store_si128( 0+ (__m128i *)y, row4a); \
    _mm_store_si128( 1+ (__m128i *)y, row4b); \
    row4a = row3a;\
    row3a = row3b;\
    row3b = row4a;\
    row4a  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );\
    row4b  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );\
    _mm_store_si128( 0+ (__m128i *)y, row2a);  \
    _mm_store_si128( 1+ (__m128i *)y, row2b);  \
    row2a  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    row2b  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    /* diagonal step */\
    /***************************************************/\
    /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13  */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][10]], (__m64)m[sig[r][ 8]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][11]], (__m64)z[sig[r][ 9]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
    row4a = _mm_xor_si128( row4a, row1a );		      \
    row4a = _mm_shuffle_epi32(row4a, 0xB1); \
    row3a = _mm_add_epi64( row3a, row4a );					\
    row2a = _mm_xor_si128( row2a, row3a );					\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][11]], (__m64)m[sig[r][ 9]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][10]], (__m64)z[sig[r][ 8]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
    row4a = _mm_xor_si128( row4a, row1a );			\
    row4a = _mm_shuffle_epi8(row4a, r16); \
    row3a = _mm_add_epi64( row3a, row4a );					\
    row2a = _mm_xor_si128( row2a, row3a );					\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
\
    /* same stuff for low-order side */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][14]], (__m64)m[sig[r][12]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][15]], (__m64)z[sig[r][13]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
    row4b = _mm_xor_si128( row4b, row1b );			\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][15]], (__m64)m[sig[r][13]] );\
    row4b = _mm_shuffle_epi32(row4b, 0xB1); \
    row3b = _mm_add_epi64( row3b, row4b );					\
    row2b = _mm_xor_si128( row2b, row3b );					\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][14]], (__m64)z[sig[r][12]] );\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
    row4b = _mm_xor_si128( row4b, row1b );			\
    row4b = _mm_shuffle_epi8(row4b, r16); \
    row3b = _mm_add_epi64( row3b, row4b );					\
    row2b = _mm_xor_si128( row2b, row3b );					\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
\
    /* shuffle back */\
    buf1a = row3a;\
    row3a = row3b;\
    row3b = buf1a;\
    _mm_store_si128( 0+ (__m128i *)y, row2a);	\
    _mm_store_si128( 1+ (__m128i *)y, row2b);  \
    row2a  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    row2b  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    _mm_store_si128( 0+ (__m128i *)y, row4a);  \
    _mm_store_si128( 1+ (__m128i *)y, row4b);  \
    row4a  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    row4b  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    							 \

  round(0);
  round(1);
  round(2);
  round(3);
  round(4);
  round(5);
  round(6);
  round(7);
  round(8);
  round(9);
  round(10);
  round(11);
  round(12);
  round(13);

  row1a = _mm_xor_si128(row3a,row1a);
  row1b = _mm_xor_si128(row3b,row1b);
  _mm_store_si128(  (__m128i *)m, row1a);
  state->h[0] ^= m[ 0]; 
  state->h[1] ^= m[ 1];    
  _mm_store_si128(  (__m128i *)m, row1b);
  state->h[2] ^= m[ 0]; 
  state->h[3] ^= m[ 1];    

  row2a = _mm_xor_si128(row4a,row2a);
  row2b = _mm_xor_si128(row4b,row2b);
  _mm_store_si128(  (__m128i *)m, row2a);
  state->h[4] ^= m[ 0];    
  state->h[5] ^= m[ 1];    
  _mm_store_si128(  (__m128i *)m, row2b);
  state->h[6] ^= m[ 0];    
  state->h[7] ^= m[ 1];    

  return 0;
}
Example #10
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
    __m128i t0, t1, t3, t4, t6, t7;

    // get bounding box
    // ----------------

    // load the first row
    t0 = _mm_load_si128 ( (__m128i*) colorBlock );
    t1 = _mm_load_si128 ( (__m128i*) colorBlock );

    __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
    // Minimum of Packed Unsigned Byte Integers
    t0 = _mm_min_epu8 ( t0, t16);
    // Maximum of Packed Unsigned Byte Integers
    t1 = _mm_max_epu8 ( t1, t16);

    __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
    t0 = _mm_min_epu8 ( t0, t32);
    t1 = _mm_max_epu8 ( t1, t32);

    __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
    t0 = _mm_min_epu8 ( t0, t48);
    t1 = _mm_max_epu8 ( t1, t48);

    // Shuffle Packed Doublewords
    t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t3);
    t1 = _mm_max_epu8 ( t1, t4);

    // Shuffle Packed Low Words
    t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t6);
    t1 = _mm_max_epu8 ( t1, t7);

    // inset the bounding box
    // ----------------------

    // Unpack Low Data
    //__m128i t66 = _mm_set1_epi8( 0 );
    __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
    t0 = _mm_unpacklo_epi8(t0, t66);
    t1 = _mm_unpacklo_epi8(t1, t66);

    // copy (movdqa)
    //__m128i t2 = _mm_load_si128 ( &t1 );
    __m128i t2 = t1;

    // Subtract Packed Integers
    t2 = _mm_sub_epi16(t2, t0);

    // Shift Packed Data Right Logical
    t2 = _mm_srli_epi16(t2, INSET_SHIFT);

    // Add Packed Integers
    t0 = _mm_add_epi16(t0, t2);

    t1 = _mm_sub_epi16(t1, t2);

    // Pack with Unsigned Saturation
    t0 = _mm_packus_epi16(t0, t0);
    t1 = _mm_packus_epi16(t1, t1);

    // store bounding box extents
    // --------------------------
    _mm_store_si128 ( (__m128i*) minColor, t0 );
    _mm_store_si128 ( (__m128i*) maxColor, t1 );
}
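Per channel, the "inset the bounding box" block above nudges both extremes toward each other by (max - min) >> INSET_SHIFT. A scalar sketch of that step (illustration only; INSET_SHIFT and byte come from the surrounding code):
static void InsetChannel( byte *minChannel, byte *maxChannel )
{
    int inset = ( *maxChannel - *minChannel ) >> INSET_SHIFT;
    *minChannel = (byte)( *minChannel + inset );
    *maxChannel = (byte)( *maxChannel - inset );
}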
Example #11
static inline void
ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
			RTE_PKTMBUF_HEADROOM);
	__m128i dma_addr0, dma_addr1;

	const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX);

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (rte_mempool_get_bulk(rxq->mb_pool,
				 (void *)rxep,
				 RTE_IXGBE_RXQ_REARM_THRESH) < 0) {
		if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			dma_addr0 = _mm_setzero_si128();
			for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				_mm_store_si128((__m128i *)&rxdp[i].read,
						dma_addr0);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			RTE_IXGBE_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
	for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		__m128i vaddr0, vaddr1;
		uintptr_t p0, p1;

		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

		/*
		 * Flush mbuf with pkt template.
		 * Data to be rearmed is 6 bytes long.
		 * Though, RX will overwrite ol_flags that are coming next
		 * anyway. So overwrite whole 8 bytes with one load:
		 * 6 bytes of rearm_data plus first 2 bytes of ol_flags.
		 */
		p0 = (uintptr_t)&mb0->rearm_data;
		*(uint64_t *)p0 = rxq->mbuf_initializer;
		p1 = (uintptr_t)&mb1->rearm_data;
		*(uint64_t *)p1 = rxq->mbuf_initializer;

		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
		vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr));
		vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr));

		/* convert pa to dma_addr hdr/data */
		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);

		/* add headroom to pa values */
		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

		/* set Header Buffer Address to zero */
		dma_addr0 =  _mm_and_si128(dma_addr0, hba_msk);
		dma_addr1 =  _mm_and_si128(dma_addr1, hba_msk);

		/* flush desc with pa dma_addr */
		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
	}

	rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH;

	rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ?
			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	/* Update the tail pointer on the NIC */
	IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
}
Example #12
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
	ALIGN16( byte color0[16] );
	ALIGN16( byte color1[16] );
	ALIGN16( byte color2[16] );
	ALIGN16( byte color3[16] );
	ALIGN16( byte result[16] );

	// mov esi, maxColor
	// mov edi, minColor

	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	t7 = _mm_setzero_si128();
	//t7 = _mm_xor_si128(t7, t7);
	_mm_store_si128 ( (__m128i*) &result, t7 );


	//t0 = _mm_load_si128 ( (__m128i*)  maxColor );
	t0 = _mm_cvtsi32_si128( *(int*)maxColor);

	// Bitwise AND
	__m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
	t0 = _mm_and_si128(t0, tt);

	t0 = _mm_unpacklo_epi8(t0, t7);

	t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	// Bitwise Logical OR
	t0 = _mm_or_si128(t0, t4);
	t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565




	//t1 = _mm_load_si128 ( (__m128i*)  minColor );
	t1 = _mm_cvtsi32_si128( *(int*)minColor);

	t1 = _mm_and_si128(t1, tt);

	t1 = _mm_unpacklo_epi8(t1, t7);

	t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	t1 = _mm_or_si128(t1, t4);
	t1 = _mm_or_si128(t1, t5);  // t1 contains color1 in 565



	t2 = t0;

	t2 = _mm_packus_epi16(t2, t7);

	t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color0, t2 );

	t6 = t0;
	t6 = _mm_add_epi16(t6, t0);
	t6 = _mm_add_epi16(t6, t1);

	// Multiply Packed Signed Integers and Store High Result
	__m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
	t6 = _mm_mulhi_epi16(t6, tw3);
	t6 = _mm_packus_epi16(t6, t7);

	t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color2, t6 );

	t3 = t1;
	t3 = _mm_packus_epi16(t3, t7);
	t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color1, t3 );

	t1 = _mm_add_epi16(t1, t1);
	t0 = _mm_add_epi16(t0, t1);

	t0 = _mm_mulhi_epi16(t0, tw3);
	t0 = _mm_packus_epi16(t0, t7);

	t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color3, t0 );

	__m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
	__m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
	__m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

	    // mov eax, 32
	    // mov esi, colorBlock
	int x = 32;
	//const byte *c = colorBlock;
	while (x >= 0)
	  {
	    t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
	    t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t0 = t3;
	    t6 = t5;
	    // Compute Sum of Absolute Difference
	    __m128i c0 = _mm_load_si128 ( (__m128i*)  color0 );
	    t0 = _mm_sad_epu8(t0, c0);
	    t6 = _mm_sad_epu8(t6, c0);
	    // Pack with Signed Saturation
	    t0 = _mm_packs_epi32 (t0, t6);

	    t1 = t3;
	    t6 = t5;
	    __m128i c1 = _mm_load_si128 ( (__m128i*)  color1 );
	    t1 = _mm_sad_epu8(t1, c1);
	    t6 = _mm_sad_epu8(t6, c1);
	    t1 = _mm_packs_epi32 (t1, t6);

	    t2 = t3;
	    t6 = t5;
	    __m128i c2 = _mm_load_si128 ( (__m128i*)  color2 );
	    t2 = _mm_sad_epu8(t2, c2);
	    t6 = _mm_sad_epu8(t6, c2);
	    t2 = _mm_packs_epi32 (t2, t6);

	    __m128i c3 = _mm_load_si128 ( (__m128i*)  color3 );
	    t3 = _mm_sad_epu8(t3, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t3 = _mm_packs_epi32 (t3, t5);


	    t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
	    t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c0);
	    t7 = _mm_sad_epu8(t7, c0);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t0 = _mm_packs_epi32 (t0, t6);  // d0

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c1);
	    t7 = _mm_sad_epu8(t7, c1);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t1 = _mm_packs_epi32 (t1, t6);  // d1

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c2);
	    t7 = _mm_sad_epu8(t7, c2);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t2 = _mm_packs_epi32 (t2, t6);  // d2

	    t4 = _mm_sad_epu8(t4, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t4 = _mm_packs_epi32 (t4, t5);
	    t3 = _mm_packs_epi32 (t3, t4);  // d3

	    t7 = _mm_load_si128 ( (__m128i*) result );

	    t7 = _mm_slli_epi32( t7, 16);

	    t4 = t0;
	    t5 = t1;
	    // Compare Packed Signed Integers for Greater Than
	    t0 = _mm_cmpgt_epi16(t0, t3); // b0
	    t1 = _mm_cmpgt_epi16(t1, t2); // b1
	    t4 = _mm_cmpgt_epi16(t4, t2); // b2
	    t5 = _mm_cmpgt_epi16(t5, t3); // b3
	    t2 = _mm_cmpgt_epi16(t2, t3); // b4

	    t4 = _mm_and_si128(t4, t1); // x0
	    t5 = _mm_and_si128(t5, t0); // x1
	    t2 = _mm_and_si128(t2, t0); // x2

	    t4 = _mm_or_si128(t4, t5);
	    t2 = _mm_and_si128(t2, w1);
	    t4 = _mm_and_si128(t4, w2);
	    t2 = _mm_or_si128(t2, t4);

	    t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

	    // Unpack Low Data
	    t2 = _mm_unpacklo_epi16 ( t2, w0);
	    t5 = _mm_unpacklo_epi16 ( t5, w0);

	    //t5 = _mm_slli_si128 ( t5, 8);
	    t5 = _mm_slli_epi32( t5, 8);

	    t7 = _mm_or_si128(t7, t5);
	    t7 = _mm_or_si128(t7, t2);

	    _mm_store_si128 ( (__m128i*) &result, t7 );

	    x -=32;
	  }

	t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
	t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
	t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

	t4 = _mm_slli_epi32 ( t4, 2);
	t5 = _mm_slli_epi32 ( t5, 4);
	t6 = _mm_slli_epi32 ( t6, 6);

	t7 = _mm_or_si128(t7, t4);
	t7 = _mm_or_si128(t7, t5);
	t7 = _mm_or_si128(t7, t6);

	//_mm_store_si128 ( (__m128i*) outData, t7 );

	int r = _mm_cvtsi128_si32 (t7);
	memcpy(outData, &r, 4);   // Anything better ?

	outData += 4;
}
Example #13
 ScoreKeyValue& operator=(const ScoreKeyValue& other) {
   _mm_store_si128(&as_m128i, other.as_m128i);
   return *this;
 }
Example #14
 ScoreKeyValue(const ScoreKeyValue& other) {
   static_assert(sizeof(ScoreKeyValue) == sizeof(__m128i),
                 "sizeof(ScoreKeyValue) should be equal to sizeof(__m128i)");
   _mm_store_si128(&as_m128i, other.as_m128i);
 }
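Both special members above copy the whole object as a single 128-bit lane, which only works if the struct is exactly 16 bytes and 16-byte aligned (the static_assert checks the size). A hypothetical layout consistent with that, purely as an assumption since the real definition lives elsewhere:
#include <cstdint>
#include <emmintrin.h>

struct alignas(16) ScoreKeyValue {
  // plus the copy constructor and operator= shown above
  union {
    struct {
      std::uint64_t key;
      std::uint64_t score;
    };                  // anonymous struct: a widely supported extension
    __m128i as_m128i;   // the 128-bit view used by _mm_store_si128
  };
};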
Example #15
static inline int blake512_compress( state * state, const u8 * datablock ) 
{

  __m128i row1l;
  __m128i row2l;
  __m128i row3l;
  __m128i row4l;
  u64 row1hl, row1hh;
  u64 row2hl, row2hh;
  u64 row3hl, row3hh;
  u64 row4hl, row4hh;

  const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
  const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

  union
  {
    __m128i u128[8];
    u64     u64[16];
  } m;

  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  u64     u0, u1, u2, u3;
  __m128i b0;
  u64 b1l, b1h;

  m.u128[0] = _mm_loadu_si128((__m128i*)(datablock +   0));
  m.u128[1] = _mm_loadu_si128((__m128i*)(datablock +  16));
  m.u128[2] = _mm_loadu_si128((__m128i*)(datablock +  32));
  m.u128[3] = _mm_loadu_si128((__m128i*)(datablock +  48));
  m.u128[4] = _mm_loadu_si128((__m128i*)(datablock +  64));
  m.u128[5] = _mm_loadu_si128((__m128i*)(datablock +  80));
  m.u128[6] = _mm_loadu_si128((__m128i*)(datablock +  96));
  m.u128[7] = _mm_loadu_si128((__m128i*)(datablock + 112));

  m.u128[0] = BSWAP64(m.u128[0]);
  m.u128[1] = BSWAP64(m.u128[1]);
  m.u128[2] = BSWAP64(m.u128[2]);
  m.u128[3] = BSWAP64(m.u128[3]);
  m.u128[4] = BSWAP64(m.u128[4]);
  m.u128[5] = BSWAP64(m.u128[5]);
  m.u128[6] = BSWAP64(m.u128[6]);
  m.u128[7] = BSWAP64(m.u128[7]);

  row1l = _mm_load_si128((__m128i*)&state->h[0]);
  row1hl = state->h[2];
  row1hh = state->h[3];

  row2l = _mm_load_si128((__m128i*)&state->h[4]);
  row2hl = state->h[6];
  row2hh = state->h[7];

  row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
  row3hl = 0xA4093822299F31D0ULL;
  row3hh = 0x082EFA98EC4E6C89ULL;

  row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
  row4hl = 0xC0AC29B7C97C50DDULL;
  row4hh = 0x3F84D5B5B5470917ULL;

  if(!state->nullt)
  {
  	row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
    row4hl ^= state->t[1];
    row4hh ^= state->t[1];
  }

  ROUND( 0);
  ROUND( 1);
  ROUND( 2);
  ROUND( 3);
  ROUND( 4);
  ROUND( 5);
  ROUND( 6);
  ROUND( 7);
  ROUND( 8);
  ROUND( 9);
  ROUND(10);
  ROUND(11);
  ROUND(12);
  ROUND(13);
  ROUND(14);
  ROUND(15);

  row1l = _mm_xor_si128(row3l,row1l);
  row1hl ^= row3hl;
  row1hh ^= row3hh;

  _mm_store_si128((__m128i*)&state->h[0], _mm_xor_si128(row1l, _mm_load_si128((__m128i*)&state->h[0])));
  state->h[2] ^= row1hl;
  state->h[3] ^= row1hh;

  row2l = _mm_xor_si128(row4l,row2l);
  row2hl ^= row4hl;
  row2hh ^= row4hh;

  _mm_store_si128((__m128i*)&state->h[4], _mm_xor_si128(row2l, _mm_load_si128((__m128i*)&state->h[4])));
  state->h[6] ^= row2hl;
  state->h[7] ^= row2hh;
  
  return 0;
}
Example #16
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
	if (!pcszText || !*pcszText) return;

	// Find out actual size of text
	int nChars = _tcslen(pcszText);
	uFlags |= DT_NOCLIP;

	int iX = rc.left;
	int iY = rc.top;
	int iXW = (rc.right - iX);
	int iYH = (rc.bottom - iY);

	RECT rcMin = rc;
	if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
		int iMinXW = rcMin.right - rcMin.left;
		int iMinYH = rcMin.bottom - rcMin.top;
		if (iMinXW < iXW) {
			if (uFlags & DT_CENTER) {
				iX += (iXW - iMinXW)/2;
				uFlags &= ~DT_CENTER;
			} else if (uFlags & DT_RIGHT) {
				iX += (iXW - iMinXW);
				uFlags &= ~DT_RIGHT;
			}
			iXW = iMinXW;
		}
		if (iMinYH < iYH) {
			if (uFlags & DT_SINGLELINE) {
				if (uFlags & DT_VCENTER) {
					iY += (iYH - iMinYH)/2;
					uFlags &= ~DT_VCENTER;
				} else if (uFlags & DT_BOTTOM) {
					iY += (iYH - iMinYH);
					uFlags &= ~DT_BOTTOM;
				}
			}
			iYH = iMinYH;
		}
	}

	iXW += 2;	// NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look better!
	iYH += 2;

	// Ensure we have a big enough DIB to draw the text to
	if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH);
	if (!hbmpTextDIB) return;

	// Select color
	ieBGRA clr;
	switch (eColor) {
	case eFileName:	clr = clrFileName;		break;
	case eComment:	clr = clrComment;		break;
	case eFileInfo:	clr = clrFileInfo;		break;
	default:		clr = ieBGRA(0,0,0);	break;
	}
	clr.A = 0xFF - clrBkg.A;

	// Draw the text to in-memory DIB
	RECT rcTextDIB = { 0, 0, iXW, iYH };
	FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);

	rcTextDIB.left++;
	rcTextDIB.top++;

	DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

	// Modify DIB:
#ifndef __X64__
	if (g_bSSE2) 
#endif
	{
		__m128i r0, r1, r2, r3, r4, r5, r6, r7;

		r7 = _mm_setzero_si128();									// 0
		r6 = _mm_set1_epi32(clr.dw);								// CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB
		r6 = _mm_unpacklo_epi8(r7, r6);								// CA<<8   CR<<8   CG<<8   CB<<8   CA<<8   CR<<8   CG<<8   CB<<8
		r5 = _mm_set1_epi16(1);										// 1       1       1       1       1       1       1       1
		r4 = _mm_set1_epi32(0xFF);									// FF              FF              FF              FF
		r3 = _mm_set1_epi32(clrBkg.dw);								// DA  0   0   0   DA  0   0   0   DA  0   0   0   DA  0   0   0

		ieBGRA *py = pTextDIB;
		for (int y = iYH; y--; py += iTextDIBXW) {
			ieBGRA *px = py;

			for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {

				r0 = _mm_load_si128((__m128i *)px);
				r1 = r0;
				r2 = r0;											// X3  R3  G3  B3  X2  R2  G2  B2  X1  R1  G1  B1  X0  R0  G0  B0 
				r0 = _mm_srli_epi32(r0, 16);						// 0   0   X3  R3  0   0   X2  R2  0   0   X1  R1  0   0   X0  R0 
				r1 = _mm_srli_epi32(r1, 8);							// 0   X3  R3  G3  0   X2  R2  G2  0   X1  R1  G1  0   X0  R0  G0 
				r0 = _mm_max_epu8(r0, r2);
				r0 = _mm_max_epu8(r0, r1);							// x   x   x   A3  x   x   x   A2  x   x   x   A1  x   x   x   A0
				r0 = _mm_and_si128(r0, r4);							// 0       A3      0       A2      0       A1      0       A0
				r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
				r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));	// A3      A3      A2      A2      A1      A1      A0      A0
				r1 = r0;
				r0 = _mm_unpacklo_epi32(r0, r0);					// A1      A1      A1      A1      A0      A0      A0      A0
				r1 = _mm_unpackhi_epi32(r1, r1);					// A3      A3      A3      A3      A2      A2      A2      A2
				r0 = _mm_add_epi16(r0, r5);							// A1'     A1'     A1'     A1'     A0'     A0'     A0'     A0' 
				r1 = _mm_add_epi16(r1, r5);							// A3'     A3'     A3'     A3'     A2'     A2'     A2'     A2' 
				r0 = _mm_mulhi_epu16(r0, r6);						// xA1"    xR1     xG1     xB1     xA0"    xR0     xG0     xB0
				r1 = _mm_mulhi_epu16(r1, r6);						// xA3"    xR3     xG3     xB3     xA2"    xR2     xG2     xB2
				r0 = _mm_packus_epi16(r0, r1);						// xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0
				r0 = _mm_adds_epu8(r0, r3);							// xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0
				_mm_store_si128((__m128i *)px, r0);
			}
		}
	}
#ifndef __X64__
	else {
Example #17
0
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into 
 * 3 buffers ready to be used by compute_gamma() 
 */
void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) {
  uint32_t i;
 
  __m128i *inputPtr = (__m128i*) input; 
  __m128i in0, in1, in2;
  __m128i s0, s1, s2, s;
  __m128i p00, p01, p02, p0;
  __m128i p10, p11, p12, p1;
  
  __m128i *sysPtr = (__m128i*) h->syst; 
  __m128i *pa0Ptr = (__m128i*) h->parity0; 
  __m128i *pa1Ptr = (__m128i*) h->parity1; 
  
  // pick bits 0, 3, 6 from 1st word
  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
  // pick bits 1, 4, 7 from 2nd word
  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 2, 5 from 3rd word
  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // pick bits 1, 4, 7 from 1st word
  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
  // pick bits 2, 5 from 2nd word
  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 0, 3, 6 from 3rd word
  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
  
  // pick bits 2, 5 from 1st word
  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
  // pick bits 0, 3, 6 from 2nd word
  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
  // pick bits 1, 4, 7 from 3rd word
  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
      
  // Split systematic and parity bits
  for (i = 0; i < long_cb/8; i++) {
        
    in0 = _mm_load_si128(inputPtr); inputPtr++; 
    in1 = _mm_load_si128(inputPtr); inputPtr++;   
    in2 = _mm_load_si128(inputPtr); inputPtr++;
    
    /* Deinterleave Systematic bits */
    s0 = _mm_shuffle_epi8(in0, s0_mask);
    s1 = _mm_shuffle_epi8(in1, s1_mask);
    s2 = _mm_shuffle_epi8(in2, s2_mask);    
    s = _mm_or_si128(s0, s1);
    s = _mm_or_si128(s, s2);

    _mm_store_si128(sysPtr, s);
    sysPtr++;

    /* Deinterleave parity 0 bits */
    p00 = _mm_shuffle_epi8(in0, p00_mask);
    p01 = _mm_shuffle_epi8(in1, p01_mask);
    p02 = _mm_shuffle_epi8(in2, p02_mask);    
    p0 = _mm_or_si128(p00, p01);
    p0 = _mm_or_si128(p0, p02);
    
    _mm_store_si128(pa0Ptr, p0);
    pa0Ptr++;

    /* Deinterleave parity 1 bits */
    p10 = _mm_shuffle_epi8(in0, p10_mask);
    p11 = _mm_shuffle_epi8(in1, p11_mask);
    p12 = _mm_shuffle_epi8(in2, p12_mask);    
    p1 = _mm_or_si128(p10, p11);
    p1 = _mm_or_si128(p1, p12);

    _mm_store_si128(pa1Ptr, p1);
    pa1Ptr++;    

  }
  
  for (i = 0; i < 3; i++) {
    h->syst[i+long_cb]    = input[3*long_cb + 2*i];
    h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
  }
  for (i = 0; i < 3; i++) {
    h->app2[i+long_cb]    = input[3*long_cb + 6 + 2*i];
    h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
  }

}
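
/* A scalar reference of the same 3-way deinterleave (illustrative only; the
 * buffer names follow the function above). The input is laid out as
 * s[0] p0[0] p1[0] s[1] p0[1] p1[1] ..., so each SSE iteration above consumes
 * three 128-bit loads (24 int16 values) and emits 8 values per stream. */
#include <stdint.h>

static void deinterleave_input_scalar(const int16_t *input, int16_t *syst,
                                      int16_t *parity0, int16_t *parity1,
                                      uint32_t long_cb)
{
  for (uint32_t i = 0; i < long_cb; i++) {
    syst[i]    = input[3 * i + 0];
    parity0[i] = input[3 * i + 1];
    parity1[i] = input[3 * i + 2];
  }
}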
Example #18
0
static void build_integral_sse2(uint32_t *integral,
                                int       integral_stride,
                          const uint8_t  *src,
                          const uint8_t  *src_pre,
                          const uint8_t  *compare,
                          const uint8_t  *compare_pre,
                                int       w,
                                int       border,
                                int       dst_w,
                                int       dst_h,
                                int       dx,
                                int       dy)
{
    const __m128i zero = _mm_set1_epi8(0);
    const int bw = w + 2 * border;

    for (int y = 0; y < dst_h; y++)
    {
        __m128i prevadd = _mm_set1_epi32(0);

        const uint8_t *p1 = src_pre + y*bw;
        const uint8_t *p2 = compare_pre + (y+dy)*bw + dx;
        uint32_t *out = integral + (y*integral_stride);

        for (int x = 0; x < dst_w; x += 16)
        {
            __m128i pa, pb;
            __m128i pla, plb;
            __m128i ldiff, lldiff, lhdiff;
            __m128i ltmp,htmp;
            __m128i ladd,hadd;
            __m128i pha,phb;
            __m128i hdiff,hldiff,hhdiff;
            __m128i l2tmp,h2tmp;

            pa = _mm_loadu_si128((__m128i*)p1);      // Load source  pixels into register 1
            pb = _mm_loadu_si128((__m128i*)p2);      // Load compare pixels into register 2

            // Low
            pla = _mm_unpacklo_epi8(pa,zero);        // Unpack and interleave source  low with zeros
            plb = _mm_unpacklo_epi8(pb,zero);        // Unpack and interleave compare low with zeros

            ldiff = _mm_sub_epi16(pla,plb);          // Diff source and compare lows (subtract)
            ldiff = _mm_mullo_epi16(ldiff,ldiff);    // Square low diff (multiply at 32-bit precision)

            lldiff = _mm_unpacklo_epi16(ldiff,zero); // Unpack and interleave diff low  with zeros
            lhdiff = _mm_unpackhi_epi16(ldiff,zero); // Unpack and interleave diff high with zeros

            ltmp = _mm_slli_si128(lldiff, 4);        // Temp shift diff low left 4 bytes
            lldiff = _mm_add_epi32(lldiff, ltmp);    // Add above to diff low
            ltmp = _mm_slli_si128(lldiff, 8);        // Temp shift diff low left 8 bytes
            lldiff = _mm_add_epi32(lldiff, ltmp);    // Add above to diff low
            lldiff = _mm_add_epi32(lldiff, prevadd); // Add previous total to diff low

            ladd = _mm_shuffle_epi32(lldiff, 0xff);  // Shuffle diff low

            htmp = _mm_slli_si128(lhdiff, 4);        // Temp shift diff high left 4 bytes
            lhdiff = _mm_add_epi32(lhdiff, htmp);    // Add above to diff high
            htmp = _mm_slli_si128(lhdiff, 8);        // Temp shift diff high left 8 bytes
            lhdiff = _mm_add_epi32(lhdiff, htmp);    // Add above to diff high
            lhdiff = _mm_add_epi32(lhdiff, ladd);    // Add shuffled diff low to diff high

            prevadd = _mm_shuffle_epi32(lhdiff, 0xff); // Shuffle diff high

            // High
            pha = _mm_unpackhi_epi8(pa,zero);        // Unpack and interleave source  high with zeros
            phb = _mm_unpackhi_epi8(pb,zero);        // Unpack and interleave compare high with zeros

            hdiff = _mm_sub_epi16(pha,phb);          // Diff source and compare highs (subtract)
            hdiff = _mm_mullo_epi16(hdiff,hdiff);    // Square high diff (multiply at 32-bit precision)

            hldiff = _mm_unpacklo_epi16(hdiff,zero); // Unpack and interleave diff low  with zeros
            hhdiff = _mm_unpackhi_epi16(hdiff,zero); // Unpack and interleave diff high with zeros

            l2tmp = _mm_slli_si128(hldiff, 4);       // Temp shift diff low left 4 bytes
            hldiff = _mm_add_epi32(hldiff, l2tmp);   // Add above to diff low
            l2tmp = _mm_slli_si128(hldiff, 8);       // Temp shift diff low left 8 bytes
            hldiff = _mm_add_epi32(hldiff, l2tmp);   // Add above to diff low
            hldiff = _mm_add_epi32(hldiff, prevadd); // Add previous total to diff low

            hadd = _mm_shuffle_epi32(hldiff, 0xff);  // Shuffle diff low

            h2tmp = _mm_slli_si128(hhdiff, 4);       // Temp shift diff high left 4 bytes
            hhdiff = _mm_add_epi32(hhdiff, h2tmp);   // Add above to diff high
            h2tmp = _mm_slli_si128(hhdiff, 8);       // Temp shift diff high left 8 bytes
            hhdiff = _mm_add_epi32(hhdiff, h2tmp);   // Add above to diff high
            hhdiff = _mm_add_epi32(hhdiff, hadd);    // Add shuffled diff low to diff high

            prevadd = _mm_shuffle_epi32(hhdiff, 0xff); // Shuffle diff high

            // Store
            _mm_store_si128((__m128i*)(out),    lldiff); // Store low  diff low  in memory
            _mm_store_si128((__m128i*)(out+4),  lhdiff); // Store low  diff high in memory
            _mm_store_si128((__m128i*)(out+8),  hldiff); // Store high diff low  in memory
            _mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory

            // Increment
            out += 16;
            p1  += 16;
            p2  += 16;
        }

        if (y > 0)
        {
            out = integral + y*integral_stride;

            for (int x = 0; x < dst_w; x += 16)
            {
                *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
                                                 *(__m128i*)(out));

                *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride),
                                                     *(__m128i*)(out+4));

                *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride),
                                                     *(__m128i*)(out+8));

                *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),
                                                      *(__m128i*)(out+12));

                out += 16;
            }
        }
    }
}
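
/* A minimal sketch of the in-register prefix-sum idiom used above: two
 * shift-and-add steps turn {a, b, c, d} into {a, a+b, a+b+c, a+b+c+d}, and
 * broadcasting the top lane (_mm_shuffle_epi32 with 0xff) produces the carry
 * that is added to the next group of four sums. */
#include <emmintrin.h>

static __m128i prefix_sum_epi32(__m128i v, __m128i *carry)
{
    v = _mm_add_epi32(v, _mm_slli_si128(v, 4));  // add the neighbour one lane below
    v = _mm_add_epi32(v, _mm_slli_si128(v, 8));  // add the pair two lanes below
    v = _mm_add_epi32(v, *carry);                // add the running total so far
    *carry = _mm_shuffle_epi32(v, 0xff);         // broadcast the last lane as the new carry
    return v;
}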
Example #19
0
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
		BYTE *pDst, int dstStep, const prim_size_t *roi)
{
	int lastRow, lastCol;
	BYTE *UData,*VData,*YData;
	int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
	__m128i r0,r1,r2,r3,r4,r5,r6,r7;
	__m128i *buffer;
	
	/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
	 * last_column: if it's the last column in a line, set to 10B (for handling line endings that are not a multiple of four) */

	buffer = _aligned_malloc(4 * 16, 16);
	
	YData = (BYTE*) pSrc[0];
	UData = (BYTE*) pSrc[1];
	VData = (BYTE*) pSrc[2];
	
	nWidth = roi->width;
	nHeight = roi->height;
	
	if ((lastCol = (nWidth & 3)))
	{
		switch (lastCol)
		{
			case 1:
				r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF);
				break;

			case 2:
				r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF);
				break;

			case 3:
				r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);
				break;
		}

		_mm_store_si128(buffer+3,r7);
		lastCol = 1;
	}
	
	nWidth += 3;
	nWidth = nWidth >> 2;
	
	lastRow = nHeight & 1;
	nHeight++;
	nHeight = nHeight >> 1;
	
	VaddDst = (dstStep << 1) - (nWidth << 4);
	VaddY = (srcStep[0] << 1) - (nWidth << 2);
	VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
	VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
	
	while (nHeight-- > 0)
	{
		if (nHeight == 0)
			lastRow <<= 1;

		i = 0;
		
		do
		{
			if (!(i & 0x01))
			{
			/* Y-, U- and V-data is stored in different arrays.
			* We start with processing U-data.
			*
			* at first we fetch four U-values from its array and shuffle them like this:
			*	0d0d 0c0c 0b0b 0a0a
			* we've done two things: converting the values to signed words and duplicating
			* each value, because two pixels always "share" the same U- (and V-) data */
				r0 = _mm_cvtsi32_si128(*(UINT32 *)UData);
				r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
				r0 = _mm_shuffle_epi8(r0,r5);
				
				UData += 4;
				
			/* then we subtract 128 from each value, so we get D */
				r3 = _mm_set_epi16(128,128,128,128,128,128,128,128);
				r0 = _mm_subs_epi16(r0,r3);
				
			/* we need to do two things with our D, so let's store it for later use */
				r2 = r0;
				
			/* now we can multiply our D by 48 and unpack it to xmm4:xmm0
			 * this is what we need to get the G data later on */
				r4 = r0;
				r7 = _mm_set_epi16(48,48,48,48,48,48,48,48);
				r0 = _mm_mullo_epi16(r0,r7);
				r4 = _mm_mulhi_epi16(r4,r7);
				r7 = r0;
				r0 = _mm_unpacklo_epi16(r0,r4);
				r4 = _mm_unpackhi_epi16(r7,r4);
				
			/* to get B data, we need to prepare a second value, D*475 */
				r1 = r2;
				r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
				r1 = _mm_mullo_epi16(r1,r7);
				r2 = _mm_mulhi_epi16(r2,r7);
				r7 = r1;
				r1 = _mm_unpacklo_epi16(r1,r2);
				r7 = _mm_unpackhi_epi16(r7,r2);
				
			/* so we got something like this: xmm7:xmm1
			 * this pair contains values for 16 pixels:
			 * aabbccdd
			 * aabbccdd, but we can only work on four pixels at once, so we need to save the upper values */
				_mm_store_si128(buffer+1,r7);
				
			/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
				r2 = _mm_cvtsi32_si128(*(UINT32 *)VData);
				r2 = _mm_shuffle_epi8(r2,r5);
				
				VData += 4;
				
				r2 = _mm_subs_epi16(r2,r3);
				
				r5 = r2;
				
			/* this is also known as E*403, we need it to convert R data */
				r3 = r2;
				r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
				r2 = _mm_mullo_epi16(r2,r7);
				r3 = _mm_mulhi_epi16(r3,r7);
				r7 = r2;
				r2 = _mm_unpacklo_epi16(r2,r3);
				r7 = _mm_unpackhi_epi16(r7,r3);
				
			/* and preserve upper four values for future ... */
				_mm_store_si128(buffer+2,r7);
				
			/* doing this step: E*120 */
				r3 = r5;
				r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
				r3 = _mm_mullo_epi16(r3,r7);
				r5 = _mm_mulhi_epi16(r5,r7);
				r7 = r3;
				r3 = _mm_unpacklo_epi16(r3,r5);
				r7 = _mm_unpackhi_epi16(r7,r5);
				
			/* now we complete what we've begun above:
			 * (48*D) + (120*E) = (48*D +120*E) */
				r0 = _mm_add_epi32(r0,r3);
				r4 = _mm_add_epi32(r4,r7);
				
			/* and store to memory ! */
				_mm_store_si128(buffer,r4);
			}
			else
			{
			/* maybe you've wondered about the conditional above?
			 * Well, we prepared UV data for eight pixels in each line, but can only process four
			 * per loop. So we need to load the data for the upper four pixels from memory every second loop! */
				r1 = _mm_load_si128(buffer+1);
				r2 = _mm_load_si128(buffer+2);
				r0 = _mm_load_si128(buffer);
			}
			
			if (++i == nWidth)
				lastCol <<= 1;
			
		/* We didn't produce any output yet, so let's do so!
		 * Ok, fetch four pixels from the Y-data array and shuffle them like this:
		 * 00d0 00c0 00b0 00a0, to get signed dwords already multiplied by 256 */
			r4 = _mm_cvtsi32_si128(*(UINT32 *)YData);
			r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
			r4 = _mm_shuffle_epi8(r4,r7);
			
			r5 = r4;
			r6 = r4;
			
		/* now we can perform the "real" conversion itself and produce output! */
			r4 = _mm_add_epi32(r4,r2);
			r5 = _mm_sub_epi32(r5,r0);
			r6 = _mm_add_epi32(r6,r1);
			
		/* in the end, we only need bytes for the RGB values.
		 * So, what do we do? Right! Shifting left makes values bigger and that's always good.
		 * Before we had dwords of data, and by shifting left and treating the result
		 * as packed words, we not only get signed words but also divide by 256.
		 * Imagine the data is now ordered this way: ddx0 ccx0 bbx0 aax0, where x is the least
		 * significant byte, which we don't need anymore because we've done some rounding */
			r4 = _mm_slli_epi32(r4,8);
			r5 = _mm_slli_epi32(r5,8);
			r6 = _mm_slli_epi32(r6,8);
			
		/* one thing we still have to face is the clip() function ...
		 * we still have signed words, and there are those min/max instructions in SSE2 ...
		 * the max instruction always takes the bigger of the two operands and stores it in the first one,
		 * and it operates with signs!
		 * if we feed it with our values and zeros, it takes the zeros if our values are smaller than
		 * zero, and otherwise our values */
			r7 = _mm_set_epi32(0,0,0,0);
			r4 = _mm_max_epi16(r4,r7);
			r5 = _mm_max_epi16(r5,r7);
			r6 = _mm_max_epi16(r6,r7);
			
		/* the same thing just completely different can be used to limit our values to 255,
		 * but now using the min instruction and 255s */
			r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
			r4 = _mm_min_epi16(r4,r7);
			r5 = _mm_min_epi16(r5,r7);
			r6 = _mm_min_epi16(r6,r7);
			
		/* Now we've got our bytes.
		 * The moment has come to assemble the three channels R, G and B into the xrgb dwords.
		 * On the Red channel we just have to AND each future dword with 00FF0000H */
			//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
			r4 = _mm_and_si128(r4,r7);
			
		/* on Green channel we have to shuffle somehow, so we get something like this:
		 * 00d0 00c0 00b0 00a0 */
			r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
			r5 = _mm_shuffle_epi8(r5,r7);
			
		/* and on Blue channel that one:
		 * 000d 000c 000b 000a */
			r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
			r6 = _mm_shuffle_epi8(r6,r7);
			
		/* and at last we or it together and get this one:
		 * xrgb xrgb xrgb xrgb */
			r4 = _mm_or_si128(r4,r5);
			r4 = _mm_or_si128(r4,r6);
			
		/* The only thing left to do now is writing the data to memory, but this gets a bit more
		 * complicated if the width is not a multiple of four and it is the last column in the line. */
			if (lastCol & 0x02)
			{
			/* let's say we only need to convert six pixels in width.
			 * Ok, the first 4 pixels are converted just like every other 4 pixels, but
			 * if it's the last loop in the line, last_column is shifted left by one (curious? have a look above),
			 * and we land here. During initialisation a mask was prepared. In this case (two remaining pixels)
			 * the two low dwords of the mask are FFFFFFFFH and the two high dwords are zero */
				r6 = _mm_load_si128(buffer+3);
			/* we and our output data with this mask to get only the valid pixel */
				r4 = _mm_and_si128(r4,r6);
			/* then we fetch memory from the destination array ... */
				r5 = _mm_lddqu_si128((__m128i *)pDst);
			/* ... and AND it with the inverse mask. We get only those pixels which should not be updated */
				r6 = _mm_andnot_si128(r6,r5);
			/* we only have to OR the two values together and write them back to the destination array,
			 * and only the pixels that should be updated really get changed. */
				r4 = _mm_or_si128(r4,r6);
			}
			_mm_storeu_si128((__m128i *)pDst,r4);
			
			if (!(lastRow & 0x02))
			{
			/* Because UV data is the same for two lines, we can process the second line right here,
			 * in the same loop. The only thing we need to do is add some offsets to the Y- and destination
			 * pointers. These offsets are iStride[0] and the target scanline.
			 * But if we don't need to process the second line, e.g. when we are in the last line of nine lines being processed,
			 * we just skip all this. */
				r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
				r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
				r4 = _mm_shuffle_epi8(r4,r7);
				
				r5 = r4;
				r6 = r4;
				
				r4 = _mm_add_epi32(r4,r2);
				r5 = _mm_sub_epi32(r5,r0);
				r6 = _mm_add_epi32(r6,r1);
				
				r4 = _mm_slli_epi32(r4,8);
				r5 = _mm_slli_epi32(r5,8);
				r6 = _mm_slli_epi32(r6,8);
				
				r7 = _mm_set_epi32(0,0,0,0);
				r4 = _mm_max_epi16(r4,r7);
				r5 = _mm_max_epi16(r5,r7);
				r6 = _mm_max_epi16(r6,r7);
				
				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
				r4 = _mm_min_epi16(r4,r7);
				r5 = _mm_min_epi16(r5,r7);
				r6 = _mm_min_epi16(r6,r7);
				
				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
				r4 = _mm_and_si128(r4,r7);
				
				r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
				r5 = _mm_shuffle_epi8(r5,r7);
				
				r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
				r6 = _mm_shuffle_epi8(r6,r7);
				
				r4 = _mm_or_si128(r4,r5);
				r4 = _mm_or_si128(r4,r6);
				
				if (lastCol & 0x02)
				{
					r6 = _mm_load_si128(buffer+3);
					r4 = _mm_and_si128(r4,r6);
					r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
					r6 = _mm_andnot_si128(r6,r5);
					r4 = _mm_or_si128(r4,r6);
					
				/* only thing is, we should shift [rbp-42] back here, because we have processed the last column,
				 * and this "special condition" can be released */
					lastCol >>= 1;
				}
				_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
			}
			
		/* after all that we have to advance the destination and Y-data pointers by four pixels */
			pDst += 16;
			YData += 4;
		}
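
/* A scalar sketch of the fixed-point conversion described in the comments above
 * (D = U - 128, E = V - 128; the shift-left-by-8-then-take-high-byte trick is an
 * integer divide by 256 of a value that already carries Y pre-scaled by 256).
 * Rounding and clamping at the edges may differ slightly from the packed-word
 * min/max used in the SSE version. */
#include <stdint.h>

static uint8_t clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static uint32_t yuv_to_xrgb_scalar(int Y, int U, int V)
{
	int D = U - 128;
	int E = V - 128;
	int R = clamp255(Y + (403 * E) / 256);
	int G = clamp255(Y - (48 * D + 120 * E) / 256);
	int B = clamp255(Y + (475 * D) / 256);
	return ((uint32_t)R << 16) | ((uint32_t)G << 8) | (uint32_t)B;
}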
Example #20
0
// @return true iff the two pages differ; false otherwise.
// @note Uses SSE3, so you must compile with -msse3.
bool pagesDifferent (const void * b1, const void * b2) {

  enum { PAGE_SIZE = 4096 };

  // Make a mask, initially all 1's.
  register __m128i mask = _mm_setzero_si128();
  mask = _mm_cmpeq_epi32(mask, mask); 


  __m128i * buf1 = (__m128i *) b1;
  __m128i * buf2 = (__m128i *) b2;

  // Some vectorizing pragmata here; not sure if gcc implements them.

#pragma vector always
  for (int i = 0; i < PAGE_SIZE / sizeof(__m128i); i += 8) {
#pragma ivdep
#pragma vector aligned

    register __m128i xmm1, xmm2;

    // Unrolled loop for speed: we load two 128-bit chunks,
    // and logically AND in their comparison.
    // If the mask gets any zero bits, the bytes differ.
    xmm1 = _mm_load_si128 (&buf1[i]);
    xmm2 = _mm_load_si128 (&buf2[i]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+1]);
    xmm2 = _mm_load_si128 (&buf2[i+1]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+2]);
    xmm2 = _mm_load_si128 (&buf2[i+2]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+3]);
    xmm2 = _mm_load_si128 (&buf2[i+3]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+4]);
    xmm2 = _mm_load_si128 (&buf2[i+4]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+5]);
    xmm2 = _mm_load_si128 (&buf2[i+5]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+6]);
    xmm2 = _mm_load_si128 (&buf2[i+6]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+7]);
    xmm2 = _mm_load_si128 (&buf2[i+7]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    // Save the mask to see whether we have found a difference or not.
    unsigned long long buf[128 / sizeof(unsigned long long) / 8]  __attribute__((aligned(16)));
    _mm_store_si128 ((__m128i *) &buf, mask);
    
    // IMPORTANT: make sure long long = 64bits!
    enum { VERIFY_LONGLONG_64 = 1 / (sizeof(long long) == 8) };

    // Now check the result.
    // Both buf[0] and buf[1] should be all ones.
    if ((buf[0] != (unsigned long long) -1) ||
	(buf[1] != (unsigned long long) -1)) {
      return true;
    }
  }

  // No differences found.
  return false;
}
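
// A minimal alternative sketch for the final mask test above: instead of storing
// the mask and checking two 64-bit words, _mm_movemask_epi8 collapses the byte
// MSBs into a 16-bit integer, which equals 0xFFFF iff no difference has been
// seen yet. This is plain SSE2, so it fits the instruction set already in use.
#include <emmintrin.h>

static inline bool maskStillAllOnes (__m128i mask) {
  return _mm_movemask_epi8(mask) == 0xFFFF;
}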
Example #21
0
int crypto_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen)
{
	hashState state;
	u_int32_t *data32, *data32_end;
	u_int64_t *data64;
	unsigned char *lastPartP, *data8_end;

	#ifdef __x86_64__
	u_int64_t i, iterations, counter, databyteLength;
	#else
	int i, iterations, counter, databyteLength;
	#endif
	// This might be a static check
	if (crypto_hash_BYTES != 32)
		return -1;
	
	databyteLength = inlen; // Want it to be the native data size, and not bigger.

	#ifdef __SSE__
	// Use SSE here, if it is available
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[0], _mm_load_si128((__m128i *) &i256p2[0]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[4], _mm_load_si128((__m128i *) &i256p2[4]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[8], _mm_load_si128((__m128i *) &i256p2[8]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[12], _mm_load_si128((__m128i *) &i256p2[12]));
	#elif defined ( __x86_64__ )
	// Or 64-bit writes if on a 64-bit system (not possible on 32-bit x86)
	hashState256_(state).DoublePipe[0] = i256p2[0];
	hashState256_(state).DoublePipe[2] = i256p2[2];
	hashState256_(state).DoublePipe[4] = i256p2[4];
	hashState256_(state).DoublePipe[6] = i256p2[6];
	hashState256_(state).DoublePipe[8] = i256p2[8];
	hashState256_(state).DoublePipe[10] = i256p2[10];
	hashState256_(state).DoublePipe[12] = i256p2[12];
	hashState256_(state).DoublePipe[14] = i256p2[14];
	#else
	// Fallback
	memcpy(hashState256_(state).DoublePipe, i256p2,  16 * sizeof(u_int32_t));
	#endif
	
	data32 = (u_int32_t *) in;
	iterations = databyteLength / BlueMidnightWish256_BLOCK_SIZE;
	data32_end = data32 + iterations*16;
	if(iterations > 0)
		Compress256(data32, data32_end, &state);
	
	databyteLength -= BlueMidnightWish256_BLOCK_SIZE * iterations;
	data64 = (u_int64_t *)hashState256_(state).LastPart;
	
	if (databyteLength < 56) {
		#ifdef __SSE__
		// Use SSE here, if it is available
		__m128i zero = _mm_setzero_si128();
		_mm_store_si128((__m128i *) &data64[0], zero);
		_mm_store_si128((__m128i *) &data64[2], zero);
		_mm_store_si128((__m128i *) &data64[4], zero);
		_mm_store_si128((__m128i *) &data64[6], zero);
		#elif defined ( __x86_64__ )
		// Or 64-bit writes if on a 64-bit system (not possible on 32-bit x86)
		data64[0] = 0;
		data64[1] = 0;
		data64[2] = 0;
		data64[3] = 0;
		data64[4] = 0;
		data64[5] = 0;
		data64[6] = 0;
		data64[7] = 0;
		#else
		// Fallback
		memset( data64 + (databyteLength >> 4), 0x00, BlueMidnightWish256_BLOCK_SIZE - ((databyteLength >> 4) << 3));
		#endif
		
		
	}
Example #22
0
static pstatus_t sse2_set_32u(
    UINT32 val,
    UINT32* pDst,
    UINT32 len)
{
	const primitives_t* prim = primitives_get_generic();
	UINT32* dptr = (UINT32*) pDst;
	__m128i xmm0;
	size_t count;

	/* If really short, just do it here. */
	if (len < 32)
	{
		while (len--) *dptr++ = val;

		return PRIMITIVES_SUCCESS;
	}

	/* Assure we can reach 16-byte alignment. */
	if (((ULONG_PTR) dptr & 0x03) != 0)
	{
		return prim->set_32u(val, pDst, len);
	}

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		*dptr++ = val;

		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	xmm0 = _mm_set1_epi32(val);
	/* Cover 256-byte chunks via SSE register stores. */
	count = len >> 6;
	len -= count << 6;

	/* Do 256-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
	}

	/* Cover 16-byte chunks via SSE register stores. */
	count = len >> 2;
	len -= count << 2;

	/* Do 16-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 4;
	}

	/* Do leftover bytes. */
	while (len--) *dptr++ = val;

	return PRIMITIVES_SUCCESS;
}
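
/* A minimal sketch of the alignment reasoning above (illustrative, using the
 * same types as the routine): element-wise UINT32 stores can only ever reach
 * 16-byte alignment if the destination is already 4-byte aligned, hence the
 * early fallback to the generic path; after that the seek loop advances at most
 * three elements before _mm_store_si128 becomes legal. */
static int sse2_can_align_32u(const UINT32* dptr)
{
	return (((ULONG_PTR) dptr) & 0x03) == 0;
}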
Example #23
0
PRBool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
            (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
             blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
            (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
             whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return PR_FALSE;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
            (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return PR_FALSE;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (PRInt32 i = 0; i < size.height; ++i) {
        PRInt32 j = 0;
        // Loop single pixels until at 4 byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the 4-pixel-
        // at-a-time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return PR_TRUE;
}
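
/* A scalar sketch of what RecoverPixel presumably computes (an assumption, not
 * taken from the source: the content was composited once over black and once
 * over white, pixels are ARGB32, and greenMaski/alphaMaski hold 0x0000FF00 and
 * 0xFF000000 per pixel). The color channels come from the black render and the
 * alpha is recovered from the green difference, matching the subtract/shift/mask
 * sequence in the SSE2 loop above. */
static PRUint32 RecoverPixelScalar(PRUint32 black, PRUint32 white)
{
    PRUint32 bg = (black >> 8) & 0xFF;
    PRUint32 wg = (white >> 8) & 0xFF;
    PRUint32 alpha = 255 - (wg - bg);        // wg >= bg for valid input
    return (alpha << 24) | (black & 0x00FFFFFF);
}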
Example #24
0
static pstatus_t sse2_set_8u(
    BYTE val,
    BYTE* pDst,
    UINT32 len)
{
	BYTE byte, *dptr;
	__m128i xmm0;
	size_t count;

	if (len < 16) return generic->set_8u(val, pDst, len);

	byte  = val;
	dptr = (BYTE*) pDst;

	/* Seek 16-byte alignment. */
	while ((ULONG_PTR) dptr & 0x0f)
	{
		*dptr++ = byte;

		if (--len == 0) return PRIMITIVES_SUCCESS;
	}

	xmm0 = _mm_set1_epi8(byte);
	/* Cover 256-byte chunks via SSE register stores. */
	count = len >> 8;
	len -= count << 8;

	/* Do 256-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
	}

	/* Cover 16-byte chunks via SSE register stores. */
	count = len >> 4;
	len -= count << 4;

	/* Do 16-byte chunks using one XMM register. */
	while (count--)
	{
		_mm_store_si128((__m128i*) dptr, xmm0);
		dptr += 16;
	}

	/* Do leftover bytes. */
	while (len--) *dptr++ = byte;

	return PRIMITIVES_SUCCESS;
}
Example #25
0
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiple the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i-=2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line-3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line+5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));


    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink each 16 bits to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr+=src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr+=dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
                 _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));


    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink each 16 bits to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
  }
}
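
// A scalar reference of the 16-wide 8-tap horizontal filter above (illustrative,
// assuming the usual 7-bit filter scaling: adding addFilterReg64 and doing the
// arithmetic shift by 7 is a round-to-nearest divide by 128, and the min/max
// pairing in the AVX2 code only compensates for intermediate 16-bit saturation).
static void filter_block1d16_h8_scalar(const uint8_t *src_ptr, uint8_t *output_ptr,
                                       const int16_t *filter) {
  int x, k;
  for (x = 0; x < 16; x++) {
    // taps span src_ptr[x - 3] .. src_ptr[x + 4], as the vector loads above do
    int sum = 0;
    for (k = 0; k < 8; k++)
      sum += src_ptr[x - 3 + k] * filter[k];
    sum = (sum + 64) >> 7;
    output_ptr[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}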
Example #26
0
  _declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options) {
    if (options.ignoreColor) {
      makeGreyscale(left);
      makeGreyscale(right);
    }

    float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16);
    int colorOffset = left.width * left.height;
    Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 };

    float* drp = diff.r;
    float* dgp = diff.g;
    float* dbp = diff.b;
    float* dap = diff.a;

    float* lrp = left.r;
    float* lgp = left.g;
    float* lbp = left.b;
    float* lap = left.a;

    float* rrp = right.r;
    float* rgp = right.g;
    float* rbp = right.b;
    float* rap = right.a;

    Color error = ConvertToFloat(options.errorColor);

    auto er = _mm_set_ps1(error.r);
    auto eg = _mm_set_ps1(error.g);
    auto eb = _mm_set_ps1(error.b);
    auto ea = _mm_set_ps1(error.a);

    auto tolerance = _mm_set_ps1(options.tolerance);
    auto overlayTransparency = _mm_set_ps1(options.overlayTransparency);

    OverlayType overlayType = options.overlayType;
    byte weightByDiffPercentage = options.weightByDiffPercentage;

    auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0);
    auto onei = _mm_set1_epi32(1);
    auto one = _mm_set1_ps(1);
    auto zero = _mm_set1_ps(0);

    for (int y = 0; y < left.height; y++) {
      for (int x = 0; x < left.width; x+=4) {
        auto lr = _mm_load_ps(lrp);
        auto lg = _mm_load_ps(lgp);
        auto lb = _mm_load_ps(lbp);
        auto la = _mm_load_ps(lap);

        auto rr = _mm_load_ps(rrp);
        auto rg = _mm_load_ps(rgp);
        auto rb = _mm_load_ps(rbp);
        auto ra = _mm_load_ps(rap);

        auto rdiff = _mm_sub_ps(rr, lr);
        auto gdiff = _mm_sub_ps(rg, lg);
        auto bdiff = _mm_sub_ps(rb, lb);
        auto adiff = _mm_sub_ps(ra, la);

        auto distance = _mm_mul_ps(rdiff, rdiff);
        distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff));
        distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff));
        distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff));
        distance = _mm_sqrt_ps(distance);

        auto t = overlayTransparency;
        if (weightByDiffPercentage) {
          t = _mm_mul_ps(t, distance);
        }

        auto isdiff = _mm_cmpgt_ps(distance, tolerance);

        t = _mm_min_ps(one, _mm_max_ps(zero, t));
        auto mlr = rr;
        auto mlg = rg;
        auto mlb = rb;
        auto mla = ra;

        if (overlayType == OverlayType::Movement) {
          mlr = _mm_mul_ps(mlr, er);
          mlg = _mm_mul_ps(mlg, eg);
          mlb = _mm_mul_ps(mlb, eb);
          mla = _mm_mul_ps(mla, ea);
        }

        auto oneMinusT = _mm_sub_ps(one, t);

        auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t));
        auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t));
        auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t));
        auto mixedA = one;

        if (overlayType != OverlayType::Movement) {
          mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t));
        }

        // (((b ^ a) & mask)^a)
        auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr)));
        auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg)));
        auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb)));
        auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la)));

        diffPixelCount = _mm_xor_si128(diffPixelCount,
          _mm_and_si128(_mm_castps_si128(isdiff),
            _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei),
              diffPixelCount)));

        _mm_store_ps(drp, dr);
        _mm_store_ps(dgp, dg);
        _mm_store_ps(dbp, db);
        _mm_store_ps(dap, da);

        drp+=4;
        dgp+=4;
        dbp+=4;
        dap+=4;

        lrp+=4;
        lgp+=4;
        lbp+=4;
        lap+=4;

        rrp+=4;
        rgp+=4;
        rbp+=4;
        rap+=4;
      }
    }

    int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16);
    _mm_store_si128((__m128i*)pixelCounts, diffPixelCount);

    int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + pixelCounts[3];
    _aligned_free(pixelCounts);

    return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) };
  }
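
  // A minimal sketch of the branchless select used above: with a mask that is
  // all ones or all zeros per lane, ((b ^ a) & mask) ^ a yields b where the mask
  // is set and a where it is clear. The same identity drives the conditional
  // diffPixelCount increment without branching.
  #include <cstdint>

  static inline uint32_t selectBits(uint32_t a, uint32_t b, uint32_t mask) {
    return ((b ^ a) & mask) ^ a;   // mask == 0xFFFFFFFF -> b, mask == 0 -> a
  }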
Example #27
0
void fb_slvn_low(dig_t *c, const dig_t *a) {
	int i;
	dig_t *p, u0, u1, u2, u3;
	void *tab = fb_poly_get_slv();
	__m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm;

	perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);
	mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
	mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0);
	mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
	sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100);
	sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400);

	t0 = _mm_load_si128((__m128i *)a);
	t1 = _mm_load_si128((__m128i *)(a + 2));
	r0 = r1 = _mm_setzero_si128();

	m0 = _mm_shuffle_epi8(t1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);

	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_and_si128(m1, mask2);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m0 = _mm_and_si128(t0, mask2);
	m0 = _mm_shuffle_epi8(m0, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);

	m2 = _mm_srli_si128(m1, 8);
	m1 = _mm_andnot_si128(mask2, m1);
	m2 = _mm_slli_epi64(m2, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 6);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 7);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 1);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F));
	m1 = _mm_slli_epi64(m1, 4);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_epi64(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3));
	m1 = _mm_slli_epi64(m1, 2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_epi64(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1));
	m1 = _mm_slli_epi64(m1, 1);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000);
	sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000);

	m1 = _mm_and_si128(t0, mask0);
	m2 = _mm_and_si128(t0, mask1);
	m3 = _mm_and_si128(t1, mask0);
	m4 = _mm_and_si128(t1, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m4 = _mm_srli_epi64(m4, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m4 = _mm_shuffle_epi8(sqrt1, m4);
	m3 = _mm_shuffle_epi8(sqrt0, m3);
	m1 = _mm_or_si128(m1, m2);
	m3 = _mm_or_si128(m3, m4);
#ifndef __PCLMUL__
	align dig_t x[2];
	_mm_store_si128((__m128i *)x, m1);
	u0 = x[0];
	u1 = x[1];
	_mm_store_si128((__m128i *)x, m3);
	u2 = x[0];
	u3 = x[1];
#else
	u0 = _mm_extract_epi64(m1, 0);
	u1 = _mm_extract_epi64(m1, 1);
	u2 = _mm_extract_epi64(m3, 0);
	u3 = _mm_extract_epi64(m3, 1);
#endif

	for (i = 0; i < 8; i++) {
		p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u0 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u1 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u2 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u3 >>= 8;
	}

	_mm_store_si128((__m128i *)c, r0);
	_mm_store_si128((__m128i *)(c + 2), r1);
}
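
/* A minimal sketch of the nibble-table idiom used above: _mm_shuffle_epi8 with a
 * constant table as the first operand performs sixteen parallel 4-bit lookups,
 * so each byte is processed as two lookups (low nibble, and high nibble shifted
 * down), combined here with XOR as in the first half of the routine. */
#include <tmmintrin.h>

static __m128i nibble_lookup(__m128i bytes, __m128i lut_lo, __m128i lut_hi) {
	const __m128i lo_mask = _mm_set1_epi8(0x0F);
	__m128i lo = _mm_and_si128(bytes, lo_mask);
	__m128i hi = _mm_and_si128(_mm_srli_epi16(bytes, 4), lo_mask);
	return _mm_xor_si128(_mm_shuffle_epi8(lut_lo, lo),
			_mm_shuffle_epi8(lut_hi, hi));
}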
Example #28
0
void BM3D_Basic_Process::CollaborativeFilter(int plane,
    FLType *ResNum, FLType *ResDen,
    const FLType *src, const FLType *ref,
    const PosPairCode &code) const
{
    PCType GroupSize = static_cast<PCType>(code.size());
    // When para.GroupSize > 0, limit GroupSize up to para.GroupSize
    if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize)
    {
        GroupSize = d.para.GroupSize;
    }

    // Construct source group guided by matched pos code
    block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize);

    // Initialize the count of retained coefficients for hard-threshold filtering
    int retainedCoefs = 0;

    // Apply forward 3D transform to the source group
    d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

    // Apply hard-thresholding to the source group
    auto srcp = srcGroup.data();
    auto thrp = d.f[plane].thrTable[GroupSize - 1].get();
    const auto upper = srcp + srcGroup.size();

#if defined(__SSE2__)
    static const ptrdiff_t simd_step = 4;
    const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
    const ptrdiff_t simd_width = srcGroup.size() - simd_residue;

    static const __m128 zero_ps = _mm_setzero_ps();
    __m128i cmp_sum = _mm_setzero_si128();

    for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
    {
        const __m128 s1 = _mm_load_ps(srcp);
        const __m128 t1p = _mm_load_ps(thrp);
        const __m128 t1n = _mm_sub_ps(zero_ps, t1p);

        const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p);
        const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
        const __m128 cmp = _mm_or_ps(cmp1, cmp2);

        const __m128 d1 = _mm_and_ps(cmp, s1);
        _mm_store_ps(srcp, d1);
        cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp));
    }

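    // Reduce the four per-lane counts to a single scalar count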
    alignas(16) int32_t cmp_sum_i32[4];
    _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum);
    retainedCoefs += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3];
#endif

    for (; srcp < upper; ++srcp, ++thrp)
    {
        if (*srcp > *thrp || *srcp < -*thrp)
        {
            ++retainedCoefs;
        }
        else
        {
            *srcp = 0;
        }
    }

    // Apply backward 3D transform to the filtered group
    d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

    // Calculate weight for the filtered group
    // Also include the normalization factor to compensate for the amplification introduced by the 3D transform
    FLType denWeight = retainedCoefs < 1 ? 1 : FLType(1) / static_cast<FLType>(retainedCoefs);
    FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]);

    // Store the weighted filtered group to the numerator part of the basic estimation
    // Store the weight to the denominator part of the basic estimation
    srcGroup.AddTo(ResNum, dst_stride[plane], numWeight);
    srcGroup.CountTo(ResDen, dst_stride[plane], denWeight);
}
Example #29
0
void rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	int first;
	int last;
	__m128i l_n;
	__m128i h_n;
	__m128i h_n_m;
	__m128i tmp_n;
	__m128i dst_n;
	__m128i dst_n_p;
	__m128i dst1;
	__m128i dst2;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			
			l_n = _mm_load_si128((__m128i*) l_ptr);

			h_n = _mm_load_si128((__m128i*) h_ptr);
			h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));

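			/* Left boundary: h[-1] is out of range, so substitute h[0] */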
			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}
			
			tmp_n = _mm_add_epi16(h_n, h_n_m);
			tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			dst_n = _mm_sub_epi16(l_n, tmp_n);
			
			_mm_store_si128((__m128i*) l_ptr, dst_n);
			
			l_ptr += 8;
			h_ptr += 8;
		}

		l_ptr -= subband_width;
		h_ptr -= subband_width;
		
		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			
			h_n = _mm_load_si128((__m128i*) h_ptr);
			
			h_n = _mm_slli_epi16(h_n, 1);
			
			dst_n = _mm_load_si128((__m128i*) (l_ptr));
			dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));

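			/* Right boundary: the lane for dst[2n + 2] reads past the subband, so duplicate the last even coefficient */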
			if (n == subband_width - 8)
			{
				last = _mm_extract_epi16(dst_n_p, 6);
				dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
			}
			
			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			tmp_n = _mm_add_epi16(tmp_n, h_n);
			
			dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
			dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
			
			_mm_store_si128((__m128i*) dst_ptr, dst1);
			_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
			
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
Example #30
0
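	// Copy both 128-bit words of rhs into this object with aligned SSE2 stores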
	EvalSum& operator = (const EvalSum& rhs) {
		_mm_store_si128(&m[0], rhs.m[0]);
		_mm_store_si128(&m[1], rhs.m[1]);
		return *this;
	}