void demod_64qam_lte_b_sse(const cf_t *symbols, int8_t *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2, symbol3, symbol4;
  __m128i symbol_i1, symbol_i2, symbol_i3, symbol_i4, symbol_i, symbol_abs, symbol_abs2, symbol_12, symbol_34;
  __m128i offset1 = _mm_set1_epi8(4*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128i offset2 = _mm_set1_epi8(2*SCALE_BYTE_CONV_QAM64/sqrt(42));
  __m128 scale_v = _mm_set1_ps(-SCALE_BYTE_CONV_QAM64);
  __m128i result11, result12, result13, result22, result21, result23, result31, result32, result33;

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0);
  __m128i shuffle_negated_2 = _mm_set_epi8(11,10,0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff);
  __m128i shuffle_negated_3 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff);

  __m128i shuffle_abs_1 = _mm_set_epi8(5,4,0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff);
  __m128i shuffle_abs_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs_3 = _mm_set_epi8(0xff,0xff,15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10);

  __m128i shuffle_abs2_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,3,2,0xff,0xff,0xff,0xff,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_abs2_2 = _mm_set_epi8(0xff,0xff,9,8,0xff,0xff,0xff,0xff,7,6,0xff,0xff,0xff,0xff,5,4);
  __m128i shuffle_abs2_3 = _mm_set_epi8(15,14,0xff,0xff,0xff,0xff,13,12,0xff,0xff,0xff,0xff,11,10,0xff,0xff);

  for (int i=0;i<nsymbols/8;i++) {
    symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol3 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;
    symbol4 = _mm_load_ps(symbolsPtr); symbolsPtr+=4;

    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i3 = _mm_cvtps_epi32(_mm_mul_ps(symbol3, scale_v));
    symbol_i4 = _mm_cvtps_epi32(_mm_mul_ps(symbol4, scale_v));
    symbol_12 = _mm_packs_epi32(symbol_i1, symbol_i2);
    symbol_34 = _mm_packs_epi32(symbol_i3, symbol_i4);
    symbol_i  = _mm_packs_epi16(symbol_12, symbol_34);

    symbol_abs  = _mm_abs_epi8(symbol_i);
    symbol_abs  = _mm_sub_epi8(symbol_abs, offset1);
    symbol_abs2 = _mm_sub_epi8(_mm_abs_epi8(symbol_abs), offset2);

    result11 = _mm_shuffle_epi8(symbol_i,    shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs,  shuffle_abs_1);
    result13 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_1);

    result21 = _mm_shuffle_epi8(symbol_i,    shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs,  shuffle_abs_2);
    result23 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_2);

    result31 = _mm_shuffle_epi8(symbol_i,    shuffle_negated_3);
    result32 = _mm_shuffle_epi8(symbol_abs,  shuffle_abs_3);
    result33 = _mm_shuffle_epi8(symbol_abs2, shuffle_abs2_3);

    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result11, result12), result13)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result21, result22), result23)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(_mm_or_si128(result31, result32), result33)); resultPtr++;
  }
  for (int i=8*(nsymbols/8);i<nsymbols;i++) {
    float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
    float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));
    llr[6*i+0] = -yre;
    llr[6*i+1] = -yim;
    llr[6*i+2] = abs(yre)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+3] = abs(yim)-4*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+4] = abs(llr[6*i+2])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
    llr[6*i+5] = abs(llr[6*i+3])-2*SCALE_BYTE_CONV_QAM64/sqrt(42);
  }
}
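/* For reference, the scalar tail loop above is the whole algorithm: each symbol
 * yields six soft bits from its sign, its magnitude, and the residual magnitude.
 * A standalone sketch of that mapping (assuming, as in the SSE routine, that
 * cf_t is a complex float and SCALE_BYTE_CONV_QAM64 is the byte-scaling constant): */
static void demod_64qam_lte_b_scalar(const cf_t *symbols, int8_t *llr, int nsymbols) {
  for (int i = 0; i < nsymbols; i++) {
    float yre = (int8_t) (SCALE_BYTE_CONV_QAM64*crealf(symbols[i]));
    float yim = (int8_t) (SCALE_BYTE_CONV_QAM64*cimagf(symbols[i]));
    llr[6*i+0] = -yre;                                                  /* sign bits */
    llr[6*i+1] = -yim;
    llr[6*i+2] = fabsf(yre) - 4*SCALE_BYTE_CONV_QAM64/sqrtf(42);        /* magnitude bits */
    llr[6*i+3] = fabsf(yim) - 4*SCALE_BYTE_CONV_QAM64/sqrtf(42);
    llr[6*i+4] = fabsf(llr[6*i+2]) - 2*SCALE_BYTE_CONV_QAM64/sqrtf(42); /* residual magnitude bits */
    llr[6*i+5] = fabsf(llr[6*i+3]) - 2*SCALE_BYTE_CONV_QAM64/sqrtf(42);
  }
}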
bool WidgetAugmentedView::render() { if (!stream) return false; stream->getColorFrame(colorFrame); stream->getDepthFrame(depthFrame); // Correct the depth map if (depthCorrector == nullptr) depthBuffer = depthFrame; else depthCorrector->correct(depthFrame, depthBuffer); // Setup perspective glMatrixMode(GL_PROJECTION); glLoadIdentity(); gluPerspective(fovY, float(ColorFrame::WIDTH) / float(ColorFrame::HEIGHT), zNear, zFar); glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glEnable(GL_DEPTH_TEST); glColor4f(1.0f, 1.0f, 1.0f, 1.0f); // // Draw real world (2D color image) // glDepthFunc(GL_ALWAYS); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, textureColor); glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT, GL_RGBA, GL_UNSIGNED_BYTE, (GLvoid*)colorFrame.pixels); glActiveTexture(GL_TEXTURE1); glBindTexture(GL_TEXTURE_2D, textureDepth); KinectStream* kinect = dynamic_cast<KinectStream*>(stream.obj); if (kinect != nullptr) { kinect->mapColorFrameToDepthFrame(depthBuffer, OUT mapping); const NUI_DEPTH_IMAGE_POINT* src = mapping; GLushort* dest = textureDepthBuffer; GLushort* end = textureDepthBuffer + ColorFrame::SIZE; #define SRC(i) static_cast<short>(static_cast<unsigned short>((src + i)->depth)) #ifndef NOT_VECTORIZED // Vectorized assuming ColorFrame::SIZE % 8 == 0 __m128i min = _mm_set1_epi16(static_cast<short>(DepthFrame::MIN_DEPTH)); __m128i max = _mm_set1_epi16(static_cast<short>(DepthFrame::MAX_DEPTH)); __m128i _0 = _mm_setzero_si128(); for (; dest < end; dest += 8, src += 8) { __m128i v = _mm_set_epi16(SRC(7), SRC(6), SRC(5), SRC(4), SRC(3), SRC(2), SRC(1), SRC(0)); v = _mm_max_epu16(min, _mm_min_epu16(max, v)); v = _mm_blendv_epi8(v, max, _mm_cmpeq_epi16(_0, v)); _mm_store_si128((__m128i*)dest, v); } #else for (; dest < end; ++dest, ++src) { unsigned short s = SRC(0); s = (s > DepthFrame::MAX_DEPTH) ? DepthFrame::MAX_DEPTH : s; s = (s < DepthFrame::MIN_DEPTH) ? DepthFrame::MIN_DEPTH : s; *dest = static_cast<GLushort>(s); } #endif glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ColorFrame::WIDTH, ColorFrame::HEIGHT, GL_RED_INTEGER, GL_UNSIGNED_SHORT, (GLvoid*)textureDepthBuffer); } glActiveTexture(GL_TEXTURE0); shader2D.bind(); RenderUtils::drawRect(-1.0f, 1.0f, 2.0f, -2.0f); shader2D.release(); // // Draw augmented world // glDepthFunc(GL_LESS); glScalef(1.0f, 1.0f, -1.0f); // Invert Z axis so that +Z is in front of the camera // A plane to test occlusion /*glColor3f(0.0f, 1.0f, 0.0f); glBegin(GL_TRIANGLE_STRIP); glVertex3f(-0.5f, -0.5f, 0.5f); glVertex3f(-0.5f, 0.5f, 2.5f); glVertex3f(0.5f, -0.5f, 2.5f); glVertex3f(0.5f, 0.5f, 4.5f); glEnd();*/ glEnable(GL_LIGHTING); // Draw the objects world.render(renderManager); glDisable(GL_LIGHTING); return true; }
int camCompareDescriptors(const int *desc1, const int *desc2, const int s) { int i, j, distance = 0; __m128i sum, d1, d2, md, d, cmp; __m128i *p1 = (__m128i*)desc1, *p2 = (__m128i*)desc2; ALIGN(int out_sse[4], 16); /* Looks like a good idea... But this deteriorates performance... // Software prefetch d1 = _mm_load_si128(p1); d2 = _mm_load_si128(p2); for (i = 0; i != s; i += 32) { _mm_prefetch(&desc1[i], _MM_HINT_NTA); _mm_prefetch(&desc2[i], _MM_HINT_NTA); } */ sum = _mm_setzero_si128(); for (i = 0; i != s >> 4; i++) { // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); } _mm_store_si128((__m128i*)out_sse, sum); return out_sse[0] + out_sse[1] + out_sse[2] + out_sse[3]; }
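/* Scalar equivalent of the unrolled SSE loop above, useful as a reference or
 * fallback (a sketch; like the vector version it assumes the descriptor length
 * s is a multiple of 16 ints, since the vector loop ignores any remainder): */
static int camCompareDescriptorsScalar(const int *desc1, const int *desc2, const int s) {
  int distance = 0;
  for (int i = 0; i < s; i++) {
    int d = desc1[i] - desc2[i];
    distance += (d < 0) ? -d : d;   /* sum of absolute differences */
  }
  return distance;
}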
pstatus_t sse2_alphaComp_argb( const BYTE* pSrc1, UINT32 src1Step, const BYTE* pSrc2, UINT32 src2Step, BYTE* pDst, UINT32 dstStep, UINT32 width, UINT32 height) { const UINT32* sptr1 = (const UINT32*) pSrc1; const UINT32* sptr2 = (const UINT32*) pSrc2; UINT32* dptr; int linebytes, src1Jump, src2Jump, dstJump; UINT32 y; __m128i xmm0, xmm1; if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS; if (width < 4) /* pointless if too small */ { return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step, pDst, dstStep, width, height); } dptr = (UINT32*) pDst; linebytes = width * sizeof(UINT32); src1Jump = (src1Step - linebytes) / sizeof(UINT32); src2Jump = (src2Step - linebytes) / sizeof(UINT32); dstJump = (dstStep - linebytes) / sizeof(UINT32); xmm0 = _mm_set1_epi32(0); xmm1 = _mm_set1_epi16(1); for (y = 0; y < height; ++y) { int pixels = width; int count; /* Get to the 16-byte boundary now. */ int leadIn = 0; switch ((ULONG_PTR) dptr & 0x0f) { case 0: leadIn = 0; break; case 4: leadIn = 3; break; case 8: leadIn = 2; break; case 12: leadIn = 1; break; default: /* We'll never hit a 16-byte boundary, so do the whole * thing the slow way. */ leadIn = width; break; } if (leadIn) { pstatus_t status; status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step, (const BYTE*) sptr2, src2Step, (BYTE*) dptr, dstStep, leadIn, 1); if (status != PRIMITIVES_SUCCESS) return status; sptr1 += leadIn; sptr2 += leadIn; dptr += leadIn; pixels -= leadIn; } /* Use SSE registers to do 4 pixels at a time. */ count = pixels >> 2; pixels -= count << 2; while (count--) { __m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; /* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */ xmm2 = LOAD_SI128(sptr1); sptr1 += 4; /* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */ xmm3 = LOAD_SI128(sptr2); sptr2 += 4; /* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */ xmm4 = _mm_unpackhi_epi8(xmm2, xmm0); /* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */ xmm5 = _mm_unpackhi_epi8(xmm3, xmm0); /* subtract */ xmm6 = _mm_subs_epi16(xmm4, xmm5); /* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */ xmm4 = _mm_shufflelo_epi16(xmm4, 0xff); /* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */ xmm4 = _mm_shufflehi_epi16(xmm4, 0xff); /* Add one to alphas */ xmm4 = _mm_adds_epi16(xmm4, xmm1); /* Multiply and take low word */ xmm4 = _mm_mullo_epi16(xmm4, xmm6); /* Shift 8 right */ xmm4 = _mm_srai_epi16(xmm4, 8); /* Add xmm5 */ xmm4 = _mm_adds_epi16(xmm4, xmm5); /* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */ /* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */ xmm5 = _mm_unpacklo_epi8(xmm2, xmm0); /* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */ xmm6 = _mm_unpacklo_epi8(xmm3, xmm0); /* subtract */ xmm7 = _mm_subs_epi16(xmm5, xmm6); /* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */ xmm5 = _mm_shufflelo_epi16(xmm5, 0xff); /* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */ xmm5 = _mm_shufflehi_epi16(xmm5, 0xff); /* Add one to alphas */ xmm5 = _mm_adds_epi16(xmm5, xmm1); /* Multiply and take low word */ xmm5 = _mm_mullo_epi16(xmm5, xmm7); /* Shift 8 right */ xmm5 = _mm_srai_epi16(xmm5, 8); /* Add xmm6 */ xmm5 = _mm_adds_epi16(xmm5, xmm6); /* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */ /* Must mask off remainders or pack gets confused */ xmm3 = _mm_set1_epi16(0x00ffU); xmm4 = _mm_and_si128(xmm4, xmm3); xmm5 = _mm_and_si128(xmm5, xmm3); /* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */ xmm5 = _mm_packus_epi16(xmm5, xmm4); _mm_store_si128((__m128i*) dptr, xmm5); dptr += 4; } /* Finish off the remainder. 
*/ if (pixels) { pstatus_t status; status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step, (const BYTE*) sptr2, src2Step, (BYTE*) dptr, dstStep, pixels, 1); if (status != PRIMITIVES_SUCCESS) return status; sptr1 += pixels; sptr2 += pixels; dptr += pixels; } /* Jump to next row. */ sptr1 += src1Jump; sptr2 += src2Jump; dptr += dstJump; } return PRIMITIVES_SUCCESS; }
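/* The arithmetic inside the SSE block above, written per channel in scalar form
 * (a sketch of the blend it implements, not part of the primitives API): with a1
 * the first source's alpha, each output channel is src2 + ((src1 - src2) * (a1 + 1) >> 8),
 * i.e. a linear interpolation using the usual +1 / >>8 approximation of dividing by 255. */
static int alphaComp_channel(int c1, int c2, int a1)
{
	return c2 + (((c1 - c2) * (a1 + 1)) >> 8);
}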
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr))); srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); srcReg32b3 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); srcReg32b4 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); srcReg32b5 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); srcReg32b6 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm256_castsi256_si128(srcReg32b2), 1); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm256_castsi256_si128(srcReg32b3), 1); srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, _mm256_castsi256_si128(srcReg32b4), 1); srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, _mm256_castsi256_si128(srcReg32b5), 1); srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, _mm256_castsi256_si128(srcReg32b6), 1); srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, _mm256_castsi256_si128(srcReg32b7), 1); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i-=2) { // 
load the last 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register srcReg32b8 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1); srcReg32b9 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1); // merge every two consecutive registers // save srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); // multiply 2 adjacent elements with the filter and add the result srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, _mm256_max_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); // multiply 2 adjacent elements with the filter and add the result srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_min_epi16(srcReg32b8, srcReg32b12)); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, _mm256_max_epi16(srcReg32b8, srcReg32b12)); srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); // shift by 7 bit each 16 bit srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcReg32b1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+out_pitch), _mm256_extractf128_si256(srcReg32b1, 1)); output_ptr+=dst_stride; // save part of the registers for next strides srcReg32b10 = srcReg32b11; srcReg32b1 = srcReg32b3; srcReg32b11 = srcReg32b2; srcReg32b3 = srcReg32b5; srcReg32b2 = srcReg32b4; srcReg32b5 = srcReg32b7; srcReg32b7 = srcReg32b9; } if (i > 0) { __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together srcRegFilt4 = _mm_unpacklo_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); srcRegFilt7 = _mm_unpackhi_epi8( _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), _mm256_castsi256_si128(firstFilters)); srcRegFilt4 = 
_mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), _mm256_castsi256_si128(firstFilters)); srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), _mm256_castsi256_si128(secondFilters)); srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), _mm256_castsi256_si128(secondFilters)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), _mm256_castsi256_si128(thirdFilters)); srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); // add and saturate the results together srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); } }
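/* Scalar model of the vertical 8-tap pass above (a sketch of the intended
 * arithmetic; the intrinsics work in saturating 16-bit intermediates): each
 * output pixel is the 8-tap dot product down a column, rounded by 64, shifted
 * right by 7, and clamped to 8 bits. */
static uint8_t filter8_vert_scalar(const uint8_t *src, ptrdiff_t pitch,
                                   const int16_t *filter) {
  int sum = 64;                      /* rounding term, cf. addFilterReg64 */
  for (int k = 0; k < 8; k++)
    sum += src[k * pitch] * filter[k];
  sum >>= 7;                         /* cf. _mm256_srai_epi16(..., 7) */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;          /* cf. _mm256_packus_epi16 saturation */
  return (uint8_t)sum;
}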
mlib_status mlib_VideoColorBGR2JFIFYCC444_S16_aligned( mlib_s16 *y, mlib_s16 *cb, mlib_s16 *cr, const mlib_s16 *bgr, mlib_s32 n) { /* 0.299*32768 */ const __m128i x_c11 = _mm_set1_epi16(9798); /* 0.587*32768 */ const __m128i x_c12 = _mm_set1_epi16(19235); /* 0.114*32768 */ const __m128i x_c13 = _mm_set1_epi16(3735); /* -0.16874*32768 */ const __m128i x_c21 = _mm_set1_epi16(-5529); /* -0.33126*32768 */ const __m128i x_c22 = _mm_set1_epi16(-10855); /* 0.5*32768 */ const __m128i x_c23 = _mm_set1_epi16(16384); /* 0.5*32768 */ const __m128i x_c31 = x_c23; /* -0.41869*32768 */ const __m128i x_c32 = _mm_set1_epi16(-13720); /* -0.08131*32768 */ const __m128i x_c33 = _mm_set1_epi16(-2664); /* 2048 */ const __m128i x_coff = _mm_set1_epi16(2048 << 2); const __m128i x_zero = _mm_setzero_si128(); __m128i x_bgr0, x_bgr1, x_bgr2, x_r, x_g, x_b; __m128i x_y, x_cb, x_cr; __m128i x_t0, x_t1, x_t2, x_t3, x_t4, x_t5; __m128i *px_y, *px_cb, *px_cr, *px_bgr; mlib_d64 fr, fg, fb, fy, fcb, fcr; mlib_s32 i; px_y = (__m128i *)y; px_cb = (__m128i *)cb; px_cr = (__m128i *)cr; px_bgr = (__m128i *)bgr; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= (n - 8); i += 8) { x_bgr0 = _mm_load_si128(px_bgr++); x_bgr0 = _mm_slli_epi16(x_bgr0, 3); x_bgr1 = _mm_load_si128(px_bgr++); x_bgr1 = _mm_slli_epi16(x_bgr1, 3); x_bgr2 = _mm_load_si128(px_bgr++); x_bgr2 = _mm_slli_epi16(x_bgr2, 3); SeparateBGR48_S16; x_t0 = _mm_mulhi_epi16(x_r, x_c11); x_t1 = _mm_mulhi_epi16(x_g, x_c12); x_t2 = _mm_mulhi_epi16(x_b, x_c13); x_y = _mm_add_epi16(x_t0, x_t1); x_y = _mm_add_epi16(x_y, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c21); x_t1 = _mm_mulhi_epi16(x_g, x_c22); x_t2 = _mm_mulhi_epi16(x_b, x_c23); x_cb = _mm_add_epi16(x_t0, x_t1); x_cb = _mm_add_epi16(x_cb, x_coff); x_cb = _mm_add_epi16(x_cb, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c31); x_t1 = _mm_mulhi_epi16(x_g, x_c32); x_t2 = _mm_mulhi_epi16(x_b, x_c33); x_cr = _mm_add_epi16(x_t0, x_t1); x_cr = _mm_add_epi16(x_cr, x_coff); x_cr = _mm_add_epi16(x_cr, x_t2); /* save */ x_y = _mm_srli_epi16(x_y, 2); x_cb = _mm_srli_epi16(x_cb, 2); x_cr = _mm_srli_epi16(x_cr, 2); _mm_store_si128(px_y++, x_y); _mm_store_si128(px_cb++, x_cb); _mm_store_si128(px_cr++, x_cr); } if (i <= (n - 4)) { x_bgr0 = _mm_load_si128(px_bgr++); x_bgr0 = _mm_slli_epi16(x_bgr0, 3); x_bgr1 = _mm_loadl_epi64(px_bgr); x_bgr1 = _mm_slli_epi16(x_bgr1, 3); px_bgr = (__m128i *)((__m64 *)px_bgr + 1); SeparateBGR24_S16; x_t0 = _mm_mulhi_epi16(x_r, x_c11); x_t1 = _mm_mulhi_epi16(x_g, x_c12); x_t2 = _mm_mulhi_epi16(x_b, x_c13); x_y = _mm_add_epi16(x_t0, x_t1); x_y = _mm_add_epi16(x_y, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c21); x_t1 = _mm_mulhi_epi16(x_g, x_c22); x_t2 = _mm_mulhi_epi16(x_b, x_c23); x_cb = _mm_add_epi16(x_t0, x_t1); x_cb = _mm_add_epi16(x_cb, x_coff); x_cb = _mm_add_epi16(x_cb, x_t2); x_t0 = _mm_mulhi_epi16(x_r, x_c31); x_t1 = _mm_mulhi_epi16(x_g, x_c32); x_t2 = _mm_mulhi_epi16(x_b, x_c33); x_cr = _mm_add_epi16(x_t0, x_t1); x_cr = _mm_add_epi16(x_cr, x_coff); x_cr = _mm_add_epi16(x_cr, x_t2); /* save */ x_y = _mm_srli_epi16(x_y, 2); x_cb = _mm_srli_epi16(x_cb, 2); x_cr = _mm_srli_epi16(x_cr, 2); _mm_storel_epi64(px_y, x_y); px_y = (__m128i *)((__m64 *)px_y + 1); _mm_storel_epi64(px_cb, x_cb); px_cb = (__m128i *)((__m64 *)px_cb + 1); _mm_storel_epi64(px_cr, x_cr); px_cr = (__m128i *)((__m64 *)px_cr + 1); i += 4; } for (; i <= (n - 1); i++) { fb = bgr[3 * i]; fg = bgr[3 * i + 1]; fr = bgr[3 * i + 2]; fy = 0.29900f * fr + 0.58700f * fg + 0.11400f * fb; fcb = -0.16874f * fr - 0.33126f * fg + 0.50000f * 
fb + 2048; fcr = 0.50000f * fr - 0.41869f * fg - 0.08131f * fb + 2048; y[i] = (mlib_s16)fy; cb[i] = (mlib_s16)fcb; cr[i] = (mlib_s16)fcr; } return (MLIB_SUCCESS); }
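/* Fixed-point model of the Y computation above (a sketch): the inputs are
 * pre-shifted left by 3, _mm_mulhi_epi16 returns (a*b) >> 16 with coefficients
 * scaled by 32768, and the final >> 2 removes the remaining factor of 4, which
 * reproduces the floating-point tail loop. Assumes value << 3 still fits in a
 * signed 16-bit lane, as the vector code requires. */
static mlib_s16 jfif_y_fixed(mlib_s16 r, mlib_s16 g, mlib_s16 b)
{
	mlib_s32 y = ((((mlib_s32)r << 3) * 9798)  >> 16)   /* 0.299 * 32768 */
	           + ((((mlib_s32)g << 3) * 19235) >> 16)   /* 0.587 * 32768 */
	           + ((((mlib_s32)b << 3) * 3735)  >> 16);  /* 0.114 * 32768 */
	return (mlib_s16)(y >> 2);
}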
QT_BEGIN_NAMESPACE // Convert a scanline of RGB888 (src) to RGB32 (dst) // src must be at least len * 3 bytes // dst must be at least len * 4 bytes Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len) { quint32 *const end = dst + len; // Prologue, align dst to 16 bytes. The alignment is done on dst because it has 4 store() // for each 3 load() of src. const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3; const int prologLength = qMin(len, offsetToAlignOn16Bytes); for (int i = 0; i < prologLength; ++i) { *dst++ = qRgb(src[0], src[1], src[2]); src += 3; } // Mask the 4 first colors of the RGB888 vector const __m128i shuffleMask = _mm_set_epi8(char(0xff), 9, 10, 11, char(0xff), 6, 7, 8, char(0xff), 3, 4, 5, char(0xff), 0, 1, 2); // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB) const __m128i shuffleMaskEnd = _mm_set_epi8(char(0xff), 13, 14, 15, char(0xff), 10, 11, 12, char(0xff), 7, 8, 9, char(0xff), 4, 5, 6); // Mask to have alpha = 0xff const __m128i alphaMask = _mm_set1_epi32(0xff000000); __m128i *inVectorPtr = (__m128i *)src; __m128i *dstVectorPtr = (__m128i *)dst; const int simdRoundCount = (len - prologLength) / 16; // one iteration in the loop converts 16 pixels for (int i = 0; i < simdRoundCount; ++i) { /* RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is to load vectors of RGB888 and use palignr to select a vector out of two vectors. After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last vector of RGB888, we can mask it directly to get a last store or RGB32. After that, the first next byte is a R, and we can loop for the next 16 pixels. The conversion itself is done with a byte permutation (pshufb). */ __m128i firstSrcVector = _mm_lddqu_si128(inVectorPtr); __m128i outputVector = _mm_shuffle_epi8(firstSrcVector, shuffleMask); _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask)); ++inVectorPtr; ++dstVectorPtr; // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes // and load the next input with palignr __m128i secondSrcVector = _mm_lddqu_si128(inVectorPtr); __m128i srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 12); outputVector = _mm_shuffle_epi8(srcVector, shuffleMask); _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask)); ++inVectorPtr; ++dstVectorPtr; firstSrcVector = secondSrcVector; // We now have 8 unused bytes left in firstSrcVector secondSrcVector = _mm_lddqu_si128(inVectorPtr); srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 8); outputVector = _mm_shuffle_epi8(srcVector, shuffleMask); _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask)); ++inVectorPtr; ++dstVectorPtr; // There are now 12 unused bytes in firstSrcVector. // We can mask them directly, almost there. outputVector = _mm_shuffle_epi8(secondSrcVector, shuffleMaskEnd); _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask)); ++dstVectorPtr; } src = (uchar *)inVectorPtr; dst = (quint32 *)dstVectorPtr; while (dst != end) { *dst++ = qRgb(src[0], src[1], src[2]); src += 3; } }
void
test (__m128i *p, __m128i a)
{
  _mm_store_si128 (p, a);
}
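/* _mm_store_si128 requires p to be 16-byte aligned; for a pointer without that
 * guarantee, the unaligned form is the safe variant (a sketch): */
void
test_unaligned (__m128i *p, __m128i a)
{
  _mm_storeu_si128 (p, a);
}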
static int blake64_compress( state * state, const u8 * datablock ) { __m128i row1a,row1b; __m128i row2a,row2b; __m128i row3a,row3b; __m128i row4a,row4b; __m128i buf1a,buf2a; static const u8 rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}; __m128i r16 = _mm_load_si128((__m128i*)rot16); u64 m[16]; u64 y[16]; /* constants and permutation */ static const int sig[][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , }; static const u64 z[16] = { 0x243F6A8885A308D3ULL,0x13198A2E03707344ULL, 0xA4093822299F31D0ULL,0x082EFA98EC4E6C89ULL, 0x452821E638D01377ULL,0xBE5466CF34E90C6CULL, 0xC0AC29B7C97C50DDULL,0x3F84D5B5B5470917ULL, 0x9216D5D98979FB1BULL,0xD1310BA698DFB5ACULL, 0x2FFD72DBD01ADFB7ULL,0xB8E1AFED6A267E96ULL, 0xBA7C9045F12C7F99ULL,0x24A19947B3916CF7ULL, 0x0801F2E2858EFC16ULL,0x636920D871574E69ULL }; /* get message */ m[ 0] = U8TO64(datablock + 0); m[ 1] = U8TO64(datablock + 8); m[ 2] = U8TO64(datablock + 16); m[ 3] = U8TO64(datablock + 24); m[ 4] = U8TO64(datablock + 32); m[ 5] = U8TO64(datablock + 40); m[ 6] = U8TO64(datablock + 48); m[ 7] = U8TO64(datablock + 56); m[ 8] = U8TO64(datablock + 64); m[ 9] = U8TO64(datablock + 72); m[10] = U8TO64(datablock + 80); m[11] = U8TO64(datablock + 88); m[12] = U8TO64(datablock + 96); m[13] = U8TO64(datablock +104); m[14] = U8TO64(datablock +112); m[15] = U8TO64(datablock +120); row1b = _mm_set_epi64((__m64)state->h[3],(__m64)state->h[2]); row1a = _mm_set_epi64((__m64)state->h[1],(__m64)state->h[0]); row2b = _mm_set_epi64((__m64)state->h[7],(__m64)state->h[6]); row2a = _mm_set_epi64((__m64)state->h[5],(__m64)state->h[4]); row3b = _mm_set_epi64((__m64)0x082EFA98EC4E6C89ULL, (__m64)0xA4093822299F31D0ULL); row3a = _mm_set_epi64((__m64)0x13198A2E03707344ULL, (__m64)0x243F6A8885A308D3ULL); if (state->nullt) { row4b = _mm_set_epi64((__m64)0x3F84D5B5B5470917ULL, (__m64)0xC0AC29B7C97C50DDULL); row4a = _mm_set_epi64((__m64)0xBE5466CF34E90C6CULL, (__m64)0x452821E638D01377ULL); } else { row4b = _mm_set_epi64((__m64)(0x3F84D5B5B5470917ULL^state->t[1]), (__m64)(0xC0AC29B7C97C50DDULL^state->t[1])); row4a = _mm_set_epi64((__m64)(0xBE5466CF34E90C6CULL^state->t[0]), (__m64)(0x452821E638D01377ULL^state->t[0])); } /* initialization ok (beware of bug on Celeron and P4!) 
*/ #define round(r)\ /* column step */\ /***************************************************/\ /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13 */ \ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 2]], (__m64)m[sig[r][ 0]] ); \ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 3]], (__m64)z[sig[r][ 1]] ); \ buf1a = _mm_xor_si128( buf1a, buf2a ); \ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \ row4a = _mm_xor_si128( row4a, row1a ); \ row4a = _mm_shuffle_epi32(row4a, 0xB1); \ row3a = _mm_add_epi64( row3a, row4a ); \ row2a = _mm_xor_si128( row2a, row3a ); \ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \ \ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 3]], (__m64)m[sig[r][ 1]] ); \ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 2]], (__m64)z[sig[r][ 0]] ); \ buf1a = _mm_xor_si128( buf1a, buf2a ); \ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \ row4a = _mm_xor_si128( row4a, row1a ); \ row4a = _mm_shuffle_epi8(row4a, r16); \ row3a = _mm_add_epi64( row3a, row4a ); \ row2a = _mm_xor_si128( row2a, row3a ); \ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \ \ /* same stuff for low-order side */\ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 6]], (__m64)m[sig[r][ 4]] );\ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 7]], (__m64)z[sig[r][ 5]] );\ buf1a = _mm_xor_si128( buf1a, buf2a ); \ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \ row4b = _mm_xor_si128( row4b, row1b ); \ row4b = _mm_shuffle_epi32(row4b, 0xB1); \ row3b = _mm_add_epi64( row3b, row4b ); \ row2b = _mm_xor_si128( row2b, row3b ); \ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \ \ buf2a = _mm_set_epi64( (__m64)m[sig[r][ 7]], (__m64)m[sig[r][ 5]] ); \ buf1a = _mm_set_epi64( (__m64)z[sig[r][ 6]], (__m64)z[sig[r][ 4]] ); \ buf1a = _mm_xor_si128( buf1a, buf2a ); \ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \ row4b = _mm_xor_si128( row4b, row1b ); \ row4b = _mm_shuffle_epi8(row4b, r16); \ row3b = _mm_add_epi64( row3b, row4b ); \ row2b = _mm_xor_si128( row2b, row3b ); \ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \ \ /* shuffle */\ _mm_store_si128( 0+ (__m128i *)y, row4a); \ _mm_store_si128( 1+ (__m128i *)y, row4b); \ row4a = row3a;\ row3a = row3b;\ row3b = row4a;\ row4a = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );\ row4b = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );\ _mm_store_si128( 0+ (__m128i *)y, row2a); \ _mm_store_si128( 1+ (__m128i *)y, row2b); \ row2a = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \ row2b = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \ /* diagonal step */\ /***************************************************/\ /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13 */\ buf2a = _mm_set_epi64( (__m64)m[sig[r][10]], (__m64)m[sig[r][ 8]] );\ buf1a = _mm_set_epi64( (__m64)z[sig[r][11]], (__m64)z[sig[r][ 9]] );\ buf1a = _mm_xor_si128( buf1a, buf2a );\ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\ row4a = _mm_xor_si128( row4a, row1a ); \ row4a = _mm_shuffle_epi32(row4a, 0xB1); \ row3a = _mm_add_epi64( row3a, row4a ); \ row2a = _mm_xor_si128( row2a, row3a ); \ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \ \ buf2a = _mm_set_epi64( (__m64)m[sig[r][11]], (__m64)m[sig[r][ 9]] );\ buf1a = _mm_set_epi64( (__m64)z[sig[r][10]], (__m64)z[sig[r][ 8]] );\ buf1a = _mm_xor_si128( buf1a, buf2a );\ row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\ row4a = _mm_xor_si128( row4a, row1a ); \ row4a = _mm_shuffle_epi8(row4a, r16); 
\ row3a = _mm_add_epi64( row3a, row4a ); \ row2a = _mm_xor_si128( row2a, row3a ); \ row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \ \ /* same stuff for low-order side */\ buf2a = _mm_set_epi64( (__m64)m[sig[r][14]], (__m64)m[sig[r][12]] );\ buf1a = _mm_set_epi64( (__m64)z[sig[r][15]], (__m64)z[sig[r][13]] );\ buf1a = _mm_xor_si128( buf1a, buf2a );\ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\ row4b = _mm_xor_si128( row4b, row1b ); \ buf2a = _mm_set_epi64( (__m64)m[sig[r][15]], (__m64)m[sig[r][13]] );\ row4b = _mm_shuffle_epi32(row4b, 0xB1); \ row3b = _mm_add_epi64( row3b, row4b ); \ row2b = _mm_xor_si128( row2b, row3b ); \ buf1a = _mm_set_epi64( (__m64)z[sig[r][14]], (__m64)z[sig[r][12]] );\ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \ \ buf1a = _mm_xor_si128( buf1a, buf2a );\ row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\ row4b = _mm_xor_si128( row4b, row1b ); \ row4b = _mm_shuffle_epi8(row4b, r16); \ row3b = _mm_add_epi64( row3b, row4b ); \ row2b = _mm_xor_si128( row2b, row3b ); \ row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \ \ /* shuffle back */\ buf1a = row3a;\ row3a = row3b;\ row3b = buf1a;\ _mm_store_si128( 0+ (__m128i *)y, row2a); \ _mm_store_si128( 1+ (__m128i *)y, row2b); \ row2a = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \ row2b = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \ _mm_store_si128( 0+ (__m128i *)y, row4a); \ _mm_store_si128( 1+ (__m128i *)y, row4b); \ row4a = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \ row4b = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \ \ round(0); round(1); round(2); round(3); round(4); round(5); round(6); round(7); round(8); round(9); round(10); round(11); round(12); round(13); row1a = _mm_xor_si128(row3a,row1a); row1b = _mm_xor_si128(row3b,row1b); _mm_store_si128( (__m128i *)m, row1a); state->h[0] ^= m[ 0]; state->h[1] ^= m[ 1]; _mm_store_si128( (__m128i *)m, row1b); state->h[2] ^= m[ 0]; state->h[3] ^= m[ 1]; row2a = _mm_xor_si128(row4a,row2a); row2b = _mm_xor_si128(row4b,row2b); _mm_store_si128( (__m128i *)m, row2a); state->h[4] ^= m[ 0]; state->h[5] ^= m[ 1]; _mm_store_si128( (__m128i *)m, row2b); state->h[6] ^= m[ 0]; state->h[7] ^= m[ 1]; return 0; }
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor ) { __m128i t0, t1, t3, t4, t6, t7; // get bounding box // ---------------- // load the first row t0 = _mm_load_si128 ( (__m128i*) colorBlock ); t1 = _mm_load_si128 ( (__m128i*) colorBlock ); __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) ); // Minimum of Packed Unsigned Byte Integers t0 = _mm_min_epu8 ( t0, t16); // Maximum of Packed Unsigned Byte Integers t1 = _mm_max_epu8 ( t1, t16); __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) ); t0 = _mm_min_epu8 ( t0, t32); t1 = _mm_max_epu8 ( t1, t32); __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) ); t0 = _mm_min_epu8 ( t0, t48); t1 = _mm_max_epu8 ( t1, t48); // Shuffle Packed Doublewords t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t0 = _mm_min_epu8 ( t0, t3); t1 = _mm_max_epu8 ( t1, t4); // Shuffle Packed Low Words t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) ); t0 = _mm_min_epu8 ( t0, t6); t1 = _mm_max_epu8 ( t1, t7); // inset the bounding box // ---------------------- // Unpack Low Data //__m128i t66 = _mm_set1_epi8( 0 ); __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 ); t0 = _mm_unpacklo_epi8(t0, t66); t1 = _mm_unpacklo_epi8(t1, t66); // copy (movdqa) //__m128i t2 = _mm_load_si128 ( &t1 ); __m128i t2 = t1; // Subtract Packed Integers t2 = _mm_sub_epi16(t2, t0); // Shift Packed Data Right Logical t2 = _mm_srli_epi16(t2, INSET_SHIFT); // Add Packed Integers t0 = _mm_add_epi16(t0, t2); t1 = _mm_sub_epi16(t1, t2); // Pack with Unsigned Saturation t0 = _mm_packus_epi16(t0, t0); t1 = _mm_packus_epi16(t1, t1); // store bounding box extents // -------------------------- _mm_store_si128 ( (__m128i*) minColor, t0 ); _mm_store_si128 ( (__m128i*) maxColor, t1 ); }
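/* Scalar model of the bounding-box-and-inset step above (a sketch; it writes
 * only the 4 meaningful RGBA bytes, whereas the SSE version stores whole
 * 16-byte registers, and it assumes the same INSET_SHIFT constant): */
static void GetMinMaxColors_Scalar( const byte *colorBlock, byte *minColor, byte *maxColor ) {
	int mins[4] = { 255, 255, 255, 255 };
	int maxs[4] = { 0, 0, 0, 0 };
	for ( int i = 0; i < 16; i++ ) {              // 16 RGBA texels per 4x4 block
		for ( int c = 0; c < 4; c++ ) {
			int v = colorBlock[i*4+c];
			if ( v < mins[c] ) mins[c] = v;
			if ( v > maxs[c] ) maxs[c] = v;
		}
	}
	for ( int c = 0; c < 4; c++ ) {               // inset the box toward its center
		int inset = ( maxs[c] - mins[c] ) >> INSET_SHIFT;
		int lo = mins[c] + inset; if ( lo > 255 ) lo = 255;
		int hi = maxs[c] - inset; if ( hi < 0 ) hi = 0;
		minColor[c] = (byte) lo;
		maxColor[c] = (byte) hi;
	}
}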
static inline void ixgbe_rxq_rearm(struct ixgbe_rx_queue *rxq) { int i; uint16_t rx_id; volatile union ixgbe_adv_rx_desc *rxdp; struct ixgbe_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start]; struct rte_mbuf *mb0, *mb1; __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM, RTE_PKTMBUF_HEADROOM); __m128i dma_addr0, dma_addr1; const __m128i hba_msk = _mm_set_epi64x(0, UINT64_MAX); rxdp = rxq->rx_ring + rxq->rxrearm_start; /* Pull 'n' more MBUFs into the software ring */ if (rte_mempool_get_bulk(rxq->mb_pool, (void *)rxep, RTE_IXGBE_RXQ_REARM_THRESH) < 0) { if (rxq->rxrearm_nb + RTE_IXGBE_RXQ_REARM_THRESH >= rxq->nb_rx_desc) { dma_addr0 = _mm_setzero_si128(); for (i = 0; i < RTE_IXGBE_DESCS_PER_LOOP; i++) { rxep[i].mbuf = &rxq->fake_mbuf; _mm_store_si128((__m128i *)&rxdp[i].read, dma_addr0); } } rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += RTE_IXGBE_RXQ_REARM_THRESH; return; } /* Initialize the mbufs in vector, process 2 mbufs in one loop */ for (i = 0; i < RTE_IXGBE_RXQ_REARM_THRESH; i += 2, rxep += 2) { __m128i vaddr0, vaddr1; uintptr_t p0, p1; mb0 = rxep[0].mbuf; mb1 = rxep[1].mbuf; /* * Flush mbuf with pkt template. * Data to be rearmed is 6 bytes long. * Though, RX will overwrite ol_flags that are coming next * anyway. So overwrite whole 8 bytes with one load: * 6 bytes of rearm_data plus first 2 bytes of ol_flags. */ p0 = (uintptr_t)&mb0->rearm_data; *(uint64_t *)p0 = rxq->mbuf_initializer; p1 = (uintptr_t)&mb1->rearm_data; *(uint64_t *)p1 = rxq->mbuf_initializer; /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */ vaddr0 = _mm_loadu_si128((__m128i *)&(mb0->buf_addr)); vaddr1 = _mm_loadu_si128((__m128i *)&(mb1->buf_addr)); /* convert pa to dma_addr hdr/data */ dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0); dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1); /* add headroom to pa values */ dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room); dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room); /* set Header Buffer Address to zero */ dma_addr0 = _mm_and_si128(dma_addr0, hba_msk); dma_addr1 = _mm_and_si128(dma_addr1, hba_msk); /* flush desc with pa dma_addr */ _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0); _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1); } rxq->rxrearm_start += RTE_IXGBE_RXQ_REARM_THRESH; if (rxq->rxrearm_start >= rxq->nb_rx_desc) rxq->rxrearm_start = 0; rxq->rxrearm_nb -= RTE_IXGBE_RXQ_REARM_THRESH; rx_id = (uint16_t) ((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); /* Update the tail pointer on the NIC */ IXGBE_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id); }
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData ) { ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); // mov esi, maxColor // mov edi, minColor __m128i t0, t1, t2, t3, t4, t5, t6, t7; t7 = _mm_setzero_si128(); //t7 = _mm_xor_si128(t7, t7); _mm_store_si128 ( (__m128i*) &result, t7 ); //t0 = _mm_load_si128 ( (__m128i*) maxColor ); t0 = _mm_cvtsi32_si128( *(int*)maxColor); // Bitwise AND __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask ); t0 = _mm_and_si128(t0, tt); t0 = _mm_unpacklo_epi8(t0, t7); t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); // Bitwise Logical OR t0 = _mm_or_si128(t0, t4); t0 = _mm_or_si128(t0, t5); // t0 contains color0 in 565 //t1 = _mm_load_si128 ( (__m128i*) minColor ); t1 = _mm_cvtsi32_si128( *(int*)minColor); t1 = _mm_and_si128(t1, tt); t1 = _mm_unpacklo_epi8(t1, t7); t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); t1 = _mm_or_si128(t1, t4); t1 = _mm_or_si128(t1, t5); // t1 contains color1 in 565 t2 = t0; t2 = _mm_packus_epi16(t2, t7); t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color0, t2 ); t6 = t0; t6 = _mm_add_epi16(t6, t0); t6 = _mm_add_epi16(t6, t1); // Multiply Packed Signed Integers and Store High Result __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 ); t6 = _mm_mulhi_epi16(t6, tw3); t6 = _mm_packus_epi16(t6, t7); t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color2, t6 ); t3 = t1; t3 = _mm_packus_epi16(t3, t7); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color1, t3 ); t1 = _mm_add_epi16(t1, t1); t0 = _mm_add_epi16(t0, t1); t0 = _mm_mulhi_epi16(t0, tw3); t0 = _mm_packus_epi16(t0, t7); t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color3, t0 ); __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0); __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1); __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2); // mov eax, 32 // mov esi, colorBlock int x = 32; //const byte *c = colorBlock; while (x >= 0) { t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0)); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8)); t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t0 = t3; t6 = t5; // Compute Sum of Absolute Difference __m128i c0 = _mm_load_si128 ( (__m128i*) color0 ); t0 = _mm_sad_epu8(t0, c0); t6 = _mm_sad_epu8(t6, c0); // Pack with Signed Saturation t0 = _mm_packs_epi32 (t0, t6); t1 = t3; t6 = t5; __m128i c1 = _mm_load_si128 ( (__m128i*) color1 ); t1 = _mm_sad_epu8(t1, c1); t6 = _mm_sad_epu8(t6, c1); t1 = _mm_packs_epi32 (t1, t6); t2 = t3; t6 = t5; __m128i c2 = _mm_load_si128 ( (__m128i*) color2 ); t2 = _mm_sad_epu8(t2, c2); t6 = _mm_sad_epu8(t6, c2); t2 = _mm_packs_epi32 (t2, t6); __m128i c3 = _mm_load_si128 ( (__m128i*) color3 ); t3 = _mm_sad_epu8(t3, c3); t5 = _mm_sad_epu8(t5, c3); t3 = _mm_packs_epi32 (t3, t5); t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16)); t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24)); t5 = _mm_shuffle_epi32( 
t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c0); t7 = _mm_sad_epu8(t7, c0); t6 = _mm_packs_epi32 (t6, t7); t0 = _mm_packs_epi32 (t0, t6); // d0 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c1); t7 = _mm_sad_epu8(t7, c1); t6 = _mm_packs_epi32 (t6, t7); t1 = _mm_packs_epi32 (t1, t6); // d1 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c2); t7 = _mm_sad_epu8(t7, c2); t6 = _mm_packs_epi32 (t6, t7); t2 = _mm_packs_epi32 (t2, t6); // d2 t4 = _mm_sad_epu8(t4, c3); t5 = _mm_sad_epu8(t5, c3); t4 = _mm_packs_epi32 (t4, t5); t3 = _mm_packs_epi32 (t3, t4); // d3 t7 = _mm_load_si128 ( (__m128i*) result ); t7 = _mm_slli_epi32( t7, 16); t4 = t0; t5 = t1; // Compare Packed Signed Integers for Greater Than t0 = _mm_cmpgt_epi16(t0, t3); // b0 t1 = _mm_cmpgt_epi16(t1, t2); // b1 t4 = _mm_cmpgt_epi16(t4, t2); // b2 t5 = _mm_cmpgt_epi16(t5, t3); // b3 t2 = _mm_cmpgt_epi16(t2, t3); // b4 t4 = _mm_and_si128(t4, t1); // x0 t5 = _mm_and_si128(t5, t0); // x1 t2 = _mm_and_si128(t2, t0); // x2 t4 = _mm_or_si128(t4, t5); t2 = _mm_and_si128(t2, w1); t4 = _mm_and_si128(t4, w2); t2 = _mm_or_si128(t2, t4); t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 )); // Unpack Low Data t2 = _mm_unpacklo_epi16 ( t2, w0); t5 = _mm_unpacklo_epi16 ( t5, w0); //t5 = _mm_slli_si128 ( t5, 8); t5 = _mm_slli_epi32( t5, 8); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t2); _mm_store_si128 ( (__m128i*) &result, t7 ); x -=32; } t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 )); t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 )); t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 )); t4 = _mm_slli_epi32 ( t4, 2); t5 = _mm_slli_epi32 ( t5, 4); t6 = _mm_slli_epi32 ( t6, 6); t7 = _mm_or_si128(t7, t4); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t6); //_mm_store_si128 ( (__m128i*) outData, t7 ); int r = _mm_cvtsi128_si32 (t7); memcpy(outData, &r, 4); // Anything better ? outData += 4; }
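/* On the "Anything better ?" note above: the memcpy is already fine (compilers
 * turn it into a single unaligned 32-bit store). An intrinsic alternative is to
 * store the low dword of the register directly; _mm_store_ss has no alignment
 * requirement. A sketch with a hypothetical helper: */
static void store_low_dword( byte *outData, __m128i t7 ) {
	_mm_store_ss( (float *) outData, _mm_castsi128_ps( t7 ) );
}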
ScoreKeyValue& operator=(const ScoreKeyValue& other) {
  _mm_store_si128(&as_m128i, other.as_m128i);
  return *this;
}
ScoreKeyValue(const ScoreKeyValue& other) {
  static_assert(sizeof(ScoreKeyValue) == sizeof(__m128i),
                "sizeof(ScoreKeyValue) should be equal to sizeof(__m128i)");
  _mm_store_si128(&as_m128i, other.as_m128i);
}
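// The aligned copies above are legal because the __m128i member forces 16-byte
// alignment of the whole object. An equivalent sketch (hypothetical free
// function, not part of the original class) using plain member assignment,
// which typically compiles to the same single 128-bit move:
inline ScoreKeyValue& assign_sketch(ScoreKeyValue& dst, const ScoreKeyValue& src) {
  dst.as_m128i = src.as_m128i;
  return dst;
}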
static inline int blake512_compress( state * state, const u8 * datablock ) { __m128i row1l; __m128i row2l; __m128i row3l; __m128i row4l; u64 row1hl, row1hh; u64 row2hl, row2hh; u64 row3hl, row3hh; u64 row4hl, row4hh; const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9); const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); union { __m128i u128[8]; u64 u64[16]; } m; __m128i t0, t1, t2, t3, t4, t5, t6, t7; u64 u0, u1, u2, u3; __m128i b0; u64 b1l, b1h; m.u128[0] = _mm_loadu_si128((__m128i*)(datablock + 0)); m.u128[1] = _mm_loadu_si128((__m128i*)(datablock + 16)); m.u128[2] = _mm_loadu_si128((__m128i*)(datablock + 32)); m.u128[3] = _mm_loadu_si128((__m128i*)(datablock + 48)); m.u128[4] = _mm_loadu_si128((__m128i*)(datablock + 64)); m.u128[5] = _mm_loadu_si128((__m128i*)(datablock + 80)); m.u128[6] = _mm_loadu_si128((__m128i*)(datablock + 96)); m.u128[7] = _mm_loadu_si128((__m128i*)(datablock + 112)); m.u128[0] = BSWAP64(m.u128[0]); m.u128[1] = BSWAP64(m.u128[1]); m.u128[2] = BSWAP64(m.u128[2]); m.u128[3] = BSWAP64(m.u128[3]); m.u128[4] = BSWAP64(m.u128[4]); m.u128[5] = BSWAP64(m.u128[5]); m.u128[6] = BSWAP64(m.u128[6]); m.u128[7] = BSWAP64(m.u128[7]); row1l = _mm_load_si128((__m128i*)&state->h[0]); row1hl = state->h[2]; row1hh = state->h[3]; row2l = _mm_load_si128((__m128i*)&state->h[4]); row2hl = state->h[6]; row2hh = state->h[7]; row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL); row3hl = 0xA4093822299F31D0ULL; row3hh = 0x082EFA98EC4E6C89ULL; row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); row4hl = 0xC0AC29B7C97C50DDULL; row4hh = 0x3F84D5B5B5470917ULL; if(!state->nullt) { row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0])); row4hl ^= state->t[1]; row4hh ^= state->t[1]; } ROUND( 0); ROUND( 1); ROUND( 2); ROUND( 3); ROUND( 4); ROUND( 5); ROUND( 6); ROUND( 7); ROUND( 8); ROUND( 9); ROUND(10); ROUND(11); ROUND(12); ROUND(13); ROUND(14); ROUND(15); row1l = _mm_xor_si128(row3l,row1l); row1hl ^= row3hl; row1hh ^= row3hh; _mm_store_si128((__m128i*)&state->h[0], _mm_xor_si128(row1l, _mm_load_si128((__m128i*)&state->h[0]))); state->h[2] ^= row1hl; state->h[3] ^= row1hh; row2l = _mm_xor_si128(row4l,row2l); row2hl ^= row4hl; row2hh ^= row4hh; _mm_store_si128((__m128i*)&state->h[4], _mm_xor_si128(row2l, _mm_load_si128((__m128i*)&state->h[4]))); state->h[6] ^= row2hl; state->h[7] ^= row2hh; return 0; }
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags) { if (!pcszText || !*pcszText) return; // Find out actual size of text int nChars = _tcslen(pcszText); uFlags |= DT_NOCLIP; int iX = rc.left; int iY = rc.top; int iXW = (rc.right - iX); int iYH = (rc.bottom - iY); RECT rcMin = rc; if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) { int iMinXW = rcMin.right - rcMin.left; int iMinYH = rcMin.bottom - rcMin.top; if (iMinXW < iXW) { if (uFlags & DT_CENTER) { iX += (iXW - iMinXW)/2; uFlags &= ~DT_CENTER; } else if (uFlags & DT_RIGHT) { iX += (iXW - iMinXW); uFlags &= ~DT_RIGHT; } iXW = iMinXW; } if (iMinYH < iYH) { if (uFlags & DT_SINGLELINE) { if (uFlags & DT_VCENTER) { iY += (iYH - iMinYH)/2; uFlags &= ~DT_VCENTER; } else if (uFlags & DT_BOTTOM) { iY += (iYH - iMinYH); uFlags &= ~DT_BOTTOM; } } iYH = iMinYH; } } iXW += 2; // NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look bette! iYH += 2; // Ensure we have a big enough DIB to draw the text to if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH); if (!hbmpTextDIB) return; // Select color ieBGRA clr; switch (eColor) { case eFileName: clr = clrFileName; break; case eComment: clr = clrComment; break; case eFileInfo: clr = clrFileInfo; break; default: clr = ieBGRA(0,0,0); break; } clr.A = 0xFF - clrBkg.A; // Draw the text to in-memory DIB RECT rcTextDIB = { 0, 0, iXW, iYH }; FillRect(hdcTextDIB, &rcTextDIB, hbrBkg); rcTextDIB.left++; rcTextDIB.top++; DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags); // Modify DIB: #ifndef __X64__ if (g_bSSE2) #endif { __m128i r0, r1, r2, r3, r4, r5, r6, r7; r7 = _mm_setzero_si128(); // 0 r6 = _mm_set1_epi32(clr.dw); // CA CR CG CB CA CR CG CB CA CR CG CB CA CR CG CB r6 = _mm_unpacklo_epi8(r7, r6); // CA<<8 CR<<8 CG<<8 CB<<8 CA<<8 CR<<8 CG<<8 CB<<8 r5 = _mm_set1_epi16(1); // 1 1 1 1 1 1 1 1 r4 = _mm_set1_epi32(0xFF); // FF FF FF FF r3 = _mm_set1_epi32(clrBkg.dw); // DA 0 0 0 DA 0 0 0 DA 0 0 0 DA 0 0 0 ieBGRA *py = pTextDIB; for (int y = iYH; y--; py += iTextDIBXW) { ieBGRA *px = py; for (int x_4 = (iXW+3)>>2; x_4--; px += 4) { r0 = _mm_load_si128((__m128i *)px); r1 = r0; r2 = r0; // X3 R3 G3 B3 X2 R2 G2 B2 X1 R1 G1 B1 X0 R0 G0 B0 r0 = _mm_srli_epi32(r0, 16); // 0 0 X3 R3 0 0 X2 R2 0 0 X1 R1 0 0 X0 R0 r1 = _mm_srli_epi32(r1, 8); // 0 X3 R3 G3 0 X2 R2 G2 0 X1 R1 G1 0 X0 R0 G0 r0 = _mm_max_epu8(r0, r2); r0 = _mm_max_epu8(r0, r1); // x x x A3 x x x A2 x x x A1 x x x A0 r0 = _mm_and_si128(r0, r4); // 0 A3 0 A2 0 A1 0 A0 r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0)); r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0)); // A3 A3 A2 A2 A1 A1 A0 A0 r1 = r0; r0 = _mm_unpacklo_epi32(r0, r0); // A1 A1 A1 A1 A0 A0 A0 A0 r1 = _mm_unpackhi_epi32(r1, r1); // A3 A3 A3 A3 A2 A2 A2 A2 r0 = _mm_add_epi16(r0, r5); // A1' A1' A1' A1' A0' A0' A0' A0' r1 = _mm_add_epi16(r1, r5); // A3' A3' A3' A3' A2' A2' A2' A2' r0 = _mm_mulhi_epu16(r0, r6); // xA1" xR1 xG1 xB1 xA0" xR0 xG0 xB0 r1 = _mm_mulhi_epu16(r1, r6); // xA3" xR3 xG3 xB3 xA2" xR2 xG2 xB2 r0 = _mm_packus_epi16(r0, r1); // xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0 r0 = _mm_adds_epu8(r0, r3); // xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0 _mm_store_si128((__m128i *)px, r0); } } } #ifndef __X64__ else {
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into * 3 buffers ready to be used by compute_gamma() */ void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) { uint32_t i; __m128i *inputPtr = (__m128i*) input; __m128i in0, in1, in2; __m128i s0, s1, s2, s; __m128i p00, p01, p02, p0; __m128i p10, p11, p12, p1; __m128i *sysPtr = (__m128i*) h->syst; __m128i *pa0Ptr = (__m128i*) h->parity0; __m128i *pa1Ptr = (__m128i*) h->parity1; // pick bits 0, 3, 6 from 1st word __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0); // pick bits 1, 4, 7 from 2st word __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff); // pick bits 2, 5 from 3rd word __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); // pick bits 1, 4, 7 from 1st word __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2); // pick bits 2, 5, from 2st word __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff); // pick bits 0, 3, 6 from 3rd word __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); // pick bits 2, 5 from 1st word __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4); // pick bits 0, 3, 6, from 2st word __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff); // pick bits 1, 4, 7 from 3rd word __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff); // Split systematic and parity bits for (i = 0; i < long_cb/8; i++) { in0 = _mm_load_si128(inputPtr); inputPtr++; in1 = _mm_load_si128(inputPtr); inputPtr++; in2 = _mm_load_si128(inputPtr); inputPtr++; /* Deinterleave Systematic bits */ s0 = _mm_shuffle_epi8(in0, s0_mask); s1 = _mm_shuffle_epi8(in1, s1_mask); s2 = _mm_shuffle_epi8(in2, s2_mask); s = _mm_or_si128(s0, s1); s = _mm_or_si128(s, s2); _mm_store_si128(sysPtr, s); sysPtr++; /* Deinterleave parity 0 bits */ p00 = _mm_shuffle_epi8(in0, p00_mask); p01 = _mm_shuffle_epi8(in1, p01_mask); p02 = _mm_shuffle_epi8(in2, p02_mask); p0 = _mm_or_si128(p00, p01); p0 = _mm_or_si128(p0, p02); _mm_store_si128(pa0Ptr, p0); pa0Ptr++; /* Deinterleave parity 1 bits */ p10 = _mm_shuffle_epi8(in0, p10_mask); p11 = _mm_shuffle_epi8(in1, p11_mask); p12 = _mm_shuffle_epi8(in2, p12_mask); p1 = _mm_or_si128(p10, p11); p1 = _mm_or_si128(p1, p12); _mm_store_si128(pa1Ptr, p1); pa1Ptr++; } for (i = 0; i < 3; i++) { h->syst[i+long_cb] = input[3*long_cb + 2*i]; h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1]; } for (i = 0; i < 3; i++) { h->app2[i+long_cb] = input[3*long_cb + 6 + 2*i]; h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1]; } }
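/* Scalar equivalent of the shuffle-based deinterleaver above for the main block
 * (a sketch; the trailing termination bits keep the special handling shown in
 * the two tail loops of the SSE routine): */
static void deinterleave_input_scalar(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) {
  for (uint32_t i = 0; i < long_cb; i++) {
    h->syst[i]    = input[3*i];       /* systematic bit */
    h->parity0[i] = input[3*i+1];     /* first parity   */
    h->parity1[i] = input[3*i+2];     /* second parity  */
  }
}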
static void build_integral_sse2(uint32_t *integral, int integral_stride, const uint8_t *src, const uint8_t *src_pre, const uint8_t *compare, const uint8_t *compare_pre, int w, int border, int dst_w, int dst_h, int dx, int dy) { const __m128i zero = _mm_set1_epi8(0); const int bw = w + 2 * border; for (int y = 0; y < dst_h; y++) { __m128i prevadd = _mm_set1_epi32(0); const uint8_t *p1 = src_pre + y*bw; const uint8_t *p2 = compare_pre + (y+dy)*bw + dx; uint32_t *out = integral + (y*integral_stride); for (int x = 0; x < dst_w; x += 16) { __m128i pa, pb; __m128i pla, plb; __m128i ldiff, lldiff, lhdiff; __m128i ltmp,htmp; __m128i ladd,hadd; __m128i pha,phb; __m128i hdiff,hldiff,hhdiff; __m128i l2tmp,h2tmp; pa = _mm_loadu_si128((__m128i*)p1); // Load source pixels into register 1 pb = _mm_loadu_si128((__m128i*)p2); // Load compare pixels into register 2 // Low pla = _mm_unpacklo_epi8(pa,zero); // Unpack and interleave source low with zeros plb = _mm_unpacklo_epi8(pb,zero); // Unpack and interleave compare low with zeros ldiff = _mm_sub_epi16(pla,plb); // Diff source and compare lows (subtract) ldiff = _mm_mullo_epi16(ldiff,ldiff); // Square low diff (multiply at 32-bit precision) lldiff = _mm_unpacklo_epi16(ldiff,zero); // Unpack and interleave diff low with zeros lhdiff = _mm_unpackhi_epi16(ldiff,zero); // Unpack and interleave diff high with zeros ltmp = _mm_slli_si128(lldiff, 4); // Temp shift diff low left 4 bytes lldiff = _mm_add_epi32(lldiff, ltmp); // Add above to diff low ltmp = _mm_slli_si128(lldiff, 8); // Temp shift diff low left 8 bytes lldiff = _mm_add_epi32(lldiff, ltmp); // Add above to diff low lldiff = _mm_add_epi32(lldiff, prevadd); // Add previous total to diff low ladd = _mm_shuffle_epi32(lldiff, 0xff); // Shuffle diff low htmp = _mm_slli_si128(lhdiff, 4); // Temp shift diff high left 4 bytes lhdiff = _mm_add_epi32(lhdiff, htmp); // Add above to diff high htmp = _mm_slli_si128(lhdiff, 8); // Temp shift diff high left 8 bytes lhdiff = _mm_add_epi32(lhdiff, htmp); // Add above to diff high lhdiff = _mm_add_epi32(lhdiff, ladd); // Add shuffled diff low to diff high prevadd = _mm_shuffle_epi32(lhdiff, 0xff); // Shuffle diff high // High pha = _mm_unpackhi_epi8(pa,zero); // Unpack and interleave source high with zeros phb = _mm_unpackhi_epi8(pb,zero); // Unpack and interleave compare high with zeros hdiff = _mm_sub_epi16(pha,phb); // Diff source and compare highs (subtract) hdiff = _mm_mullo_epi16(hdiff,hdiff); // Square high diff (multiply at 32-bit precision) hldiff = _mm_unpacklo_epi16(hdiff,zero); // Unpack and interleave diff low with zeros hhdiff = _mm_unpackhi_epi16(hdiff,zero); // Unpack and interleave diff high with zeros l2tmp = _mm_slli_si128(hldiff, 4); // Temp shift diff low 4 bytes hldiff = _mm_add_epi32(hldiff, l2tmp); // Add above to diff low l2tmp = _mm_slli_si128(hldiff, 8); // Temp shift diff low left 8 bytes hldiff = _mm_add_epi32(hldiff, l2tmp); // Add above to diff low hldiff = _mm_add_epi32(hldiff, prevadd); // Add previous total to diff low hadd = _mm_shuffle_epi32(hldiff, 0xff); // Shuffle diff low h2tmp = _mm_slli_si128(hhdiff, 4); // Temp shift diff high left 4 bytes hhdiff = _mm_add_epi32(hhdiff, h2tmp); // Add above to diff high h2tmp = _mm_slli_si128(hhdiff, 8); // Temp shift diff high left 8 bytes hhdiff = _mm_add_epi32(hhdiff, h2tmp); // Add above to diff high hhdiff = _mm_add_epi32(hhdiff, hadd); // Add shuffled diff low to diff high prevadd = _mm_shuffle_epi32(hhdiff, 0xff); // Shuffle diff high // Store _mm_store_si128((__m128i*)(out), lldiff); 
// Store low diff low in memory _mm_store_si128((__m128i*)(out+4), lhdiff); // Store low diff high in memory _mm_store_si128((__m128i*)(out+8), hldiff); // Store high diff low in memory _mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory // Increment out += 16; p1 += 16; p2 += 16; } if (y > 0) { out = integral + y*integral_stride; for (int x = 0; x < dst_w; x += 16) { *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride), *(__m128i*)(out)); *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride), *(__m128i*)(out+4)); *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride), *(__m128i*)(out+8)); *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride), *(__m128i*)(out+12)); out += 16; } } } }
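/* Scalar sketch of what build_integral_sse2() computes, under the same argument
 * conventions (src_pre / compare_pre are padded planes of width w + 2*border, with
 * the compare plane sampled at offset (dx, dy)): integral[y][x] is the sum of squared
 * pixel differences over the rectangle [0..x] x [0..y]. The vector code keeps the
 * running row sum in prevadd and folds in the previous row in a second pass; the
 * scalar version below does both in a single pass. The unused src/compare arguments
 * of the original are dropped here for brevity. */
static void build_integral_scalar(uint32_t *integral, int integral_stride,
                                  const uint8_t *src_pre, const uint8_t *compare_pre,
                                  int w, int border, int dst_w, int dst_h,
                                  int dx, int dy)
{
    const int bw = w + 2 * border;
    for (int y = 0; y < dst_h; y++) {
        const uint8_t *p1 = src_pre + y * bw;
        const uint8_t *p2 = compare_pre + (y + dy) * bw + dx;
        uint32_t *out = integral + y * integral_stride;
        uint32_t row_sum = 0;
        for (int x = 0; x < dst_w; x++) {
            int d = (int)p1[x] - (int)p2[x];
            row_sum += (uint32_t)(d * d);                 /* horizontal prefix sum */
            out[x] = row_sum + (y > 0 ? out[x - integral_stride] : 0); /* plus row above */
        }
    }
}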
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. * * at first we fetch four U-values from its array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: * aabbccdd * aabbccdd, but we can only work on four pixel at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixel in each line, but can only process four * per loop. So we need to load the upper four pixel data from memory each secound loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixel from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* no we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and thats always good. * before we had dwords of data, and by shifting left and treating the result * as packed words, we get not only signed words, but do also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we have still signed words, and there are those min/max instructions in SSE2 ... * the max instruction takes always the bigger of the two operands and stores it in the first one, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we got our bytes. * the moment has come to assemble the three channels R,G and B to the xrgb dwords * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on Blue channel that one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* Only thing to do know is writing data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in line. 
*/ if (lastCol & 0x02) { /* let's say, we need to only convert six pixel in width * Ok, the first 4 pixel will be converted just like every 4 pixel else, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixel */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. We get only those pixel, which should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixel that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the secound line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, * we just skip all this. */ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointer by four pixel */ pDst += 16; YData += 4; }
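/* Per-pixel scalar sketch of the fixed-point conversion the SSSE3 code above
 * performs, using the same coefficients it builds into its registers (403, 48/120
 * and 475, all effectively divided by 256 by the final byte extraction). This is a
 * readability aid, not FreeRDP's reference path; the output byte order assumes the
 * little-endian BGRX ("xrgb" dword) packing the code assembles. */
static BYTE clamp_u8(int v)
{
    return (BYTE)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv420_pixel_to_xrgb(BYTE Y, BYTE U, BYTE V, BYTE *dst /* B,G,R,X */)
{
    int D = (int)U - 128;                   /* chroma offsets                   */
    int E = (int)V - 128;
    int R = Y + ((403 * E) >> 8);           /* mirrors the E*403 path           */
    int G = Y - ((48 * D + 120 * E) >> 8);  /* mirrors the 48*D + 120*E path    */
    int B = Y + ((475 * D) >> 8);           /* mirrors the D*475 path           */
    dst[0] = clamp_u8(B);
    dst[1] = clamp_u8(G);
    dst[2] = clamp_u8(R);
    dst[3] = 0;                             /* unused 'x' byte                  */
}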
// @return true iff the two pages differ; false otherwise. // @note Uses SSE3, so you must compile with -msse3. bool pagesDifferent (const void * b1, const void * b2) { enum { PAGE_SIZE = 4096 }; // Make a mask, initially all 1's. register __m128i mask = _mm_setzero_si128(); mask = _mm_cmpeq_epi32(mask, mask); __m128i * buf1 = (__m128i *) b1; __m128i * buf2 = (__m128i *) b2; // Some vectorizing pragamata here; not sure if gcc implements them. #pragma vector always for (int i = 0; i < PAGE_SIZE / sizeof(__m128i); i += 8) { #pragma ivdep #pragma vector aligned register __m128i xmm1, xmm2; // Unrolled loop for speed: we load two 128-bit chunks, // and logically AND in their comparison. // If the mask gets any zero bits, the bytes differ. xmm1 = _mm_load_si128 (&buf1[i]); xmm2 = _mm_load_si128 (&buf2[i]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+1]); xmm2 = _mm_load_si128 (&buf2[i+1]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+2]); xmm2 = _mm_load_si128 (&buf2[i+2]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+3]); xmm2 = _mm_load_si128 (&buf2[i+3]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+4]); xmm2 = _mm_load_si128 (&buf2[i+4]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+5]); xmm2 = _mm_load_si128 (&buf2[i+5]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+6]); xmm2 = _mm_load_si128 (&buf2[i+6]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); xmm1 = _mm_load_si128 (&buf1[i+7]); xmm2 = _mm_load_si128 (&buf2[i+7]); mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2)); // Save the mask to see whether we have found a difference or not. unsigned long long buf[128 / sizeof(unsigned long long) / 8] __attribute__((aligned(16))); _mm_store_si128 ((__m128i *) &buf, mask); // IMPORTANT: make sure long long = 64bits! enum { VERIFY_LONGLONG_64 = 1 / (sizeof(long long) == 8) }; // Now check the result. // Both buf[0] and buf[1] should be all ones. if ((buf[0] != (unsigned long long) -1) || (buf[1] != (unsigned long long) -1)) { return true; } } // No differences found. return false; }
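/* pagesDifferent() keeps a running 128-bit equality mask and spills it to memory to
 * test it. An equivalent (and branch-friendlier) SSE2 idiom is to reduce each
 * comparison to a 16-bit scalar with _mm_movemask_epi8: a 16-byte chunk matches iff
 * the mask is 0xFFFF. Minimal sketch under the same page-size assumption; not the
 * original implementation. */
#include <emmintrin.h>
#include <stddef.h>
#include <stdbool.h>

static bool pagesDifferentMovemask(const void *b1, const void *b2)
{
    enum { PAGE_SIZE = 4096 };
    const __m128i *p1 = (const __m128i *)b1;
    const __m128i *p2 = (const __m128i *)b2;
    for (size_t i = 0; i < PAGE_SIZE / sizeof(__m128i); i++) {
        __m128i eq = _mm_cmpeq_epi8(_mm_load_si128(&p1[i]), _mm_load_si128(&p2[i]));
        if (_mm_movemask_epi8(eq) != 0xFFFF)
            return true; /* at least one byte differs in this chunk */
    }
    return false;
}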
int crypto_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen) { hashState state; u_int32_t *data32, *data32_end; u_int64_t *data64; unsigned char *lastPartP, *data8_end; #ifdef __x86_64__ u_int64_t i, iterations, counter, databyteLength; #else int i, iterations, counter, databyteLength; #endif // This might be a static check if (crypto_hash_BYTES != 32) return -1; databyteLength = inlen; // Want it to be the native data size, and not bigger. #ifdef __SSE__ // Use SSE here, if it is available _mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[0], _mm_load_si128((__m128i *) &i256p2[0])); _mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[4], _mm_load_si128((__m128i *) &i256p2[4])); _mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[8], _mm_load_si128((__m128i *) &i256p2[8])); _mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[12], _mm_load_si128((__m128i *) &i256p2[12])); #elif defined ( __x86_64__ ) // Or 64-bit writes if on 64 bit system (not really possible on x86) hashState256_(state).DoublePipe[0] = i256p2[0]; hashState256_(state).DoublePipe[2] = i256p2[2]; hashState256_(state).DoublePipe[4] = i256p2[4]; hashState256_(state).DoublePipe[6] = i256p2[6]; hashState256_(state).DoublePipe[8] = i256p2[8]; hashState256_(state).DoublePipe[10] = i256p2[10]; hashState256_(state).DoublePipe[12] = i256p2[12]; hashState256_(state).DoublePipe[14] = i256p2[14]; #else // Fallback memcpy(hashState256_(state).DoublePipe, i256p2, 16 * sizeof(u_int32_t)); #endif data32 = (u_int32_t *) in; iterations = databyteLength / BlueMidnightWish256_BLOCK_SIZE; data32_end = data32 + iterations*16; if(iterations > 0) Compress256(data32, data32_end, &state); databyteLength -= BlueMidnightWish256_BLOCK_SIZE * iterations; data64 = (u_int64_t *)hashState256_(state).LastPart; if (databyteLength < 56) { #ifdef __SSE__ // Use SSE here, if it is available __m128i zero = _mm_setzero_si128(); _mm_store_si128((__m128i *) &data64[0], zero); _mm_store_si128((__m128i *) &data64[2], zero); _mm_store_si128((__m128i *) &data64[4], zero); _mm_store_si128((__m128i *) &data64[6], zero); #elif defined ( __x86_64__ ) // Or 64-bit writes if on 64 bit system (not really possible on x86) data64[0] = 0; data64[1] = 0; data64[2] = 0; data64[3] = 0; data64[4] = 0; data64[5] = 0; data64[6] = 0; data64[7] = 0; #else // Fallback memset( data64 + (databyteLength >> 4), 0x00, BlueMidnightWish256_BLOCK_SIZE - ((databyteLength >> 4) << 3)); #endif }
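/* The "This might be a static check" note above can indeed be resolved at compile
 * time, assuming crypto_hash_BYTES is an integer constant (as it is in SUPERCOP-style
 * wrappers). Two common forms, sketched here: the C11 keyword, or the divide-by-zero
 * enum trick already used in pagesDifferent() further up for the long-long width. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
_Static_assert(crypto_hash_BYTES == 32, "BMW-256 produces a 32-byte digest");
#else
enum { VERIFY_HASH_BYTES_32 = 1 / (crypto_hash_BYTES == 32) };
#endif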
static pstatus_t sse2_set_32u( UINT32 val, UINT32* pDst, UINT32 len) { const primitives_t* prim = primitives_get_generic(); UINT32* dptr = (UINT32*) pDst; __m128i xmm0; size_t count; /* If really short, just do it here. */ if (len < 32) { while (len--) *dptr++ = val; return PRIMITIVES_SUCCESS; } /* Assure we can reach 16-byte alignment. */ if (((ULONG_PTR) dptr & 0x03) != 0) { return prim->set_32u(val, pDst, len); } /* Seek 16-byte alignment. */ while ((ULONG_PTR) dptr & 0x0f) { *dptr++ = val; if (--len == 0) return PRIMITIVES_SUCCESS; } xmm0 = _mm_set1_epi32(val); /* Cover 256-byte chunks via SSE register stores. */ count = len >> 6; len -= count << 6; /* Do 256-byte chunks using one XMM register. */ while (count--) { _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; } /* Cover 16-byte chunks via SSE register stores. */ count = len >> 2; len -= count << 2; /* Do 16-byte chunks using one XMM register. */ while (count--) { _mm_store_si128((__m128i*) dptr, xmm0); dptr += 4; } /* Do leftover bytes. */ while (len--) *dptr++ = val; return PRIMITIVES_SUCCESS; }
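/* The structure of sse2_set_32u() (and of sse2_set_8u() below) in compact form:
 * bail out if 16-byte alignment is unreachable (a UINT32 pointer that is not 4-byte
 * aligned can never reach it by stepping whole elements), walk to a 16-byte boundary,
 * fill with aligned 128-bit stores, then finish the tail with scalar writes. Sketch
 * only; the hypothetical fill_u32_sse2 omits the unrolled 256-byte inner loop and the
 * short-length / fallback handling of the originals. */
#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

static void fill_u32_sse2(uint32_t val, uint32_t *dst, size_t len)
{
    if (((uintptr_t)dst & 0x03) != 0) {          /* 16-byte alignment unreachable */
        while (len--) *dst++ = val;
        return;
    }
    while (((uintptr_t)dst & 0x0f) && len) {     /* seek 16-byte alignment */
        *dst++ = val;
        len--;
    }
    __m128i v = _mm_set1_epi32((int)val);
    for (; len >= 4; len -= 4, dst += 4)         /* 16 bytes per aligned store */
        _mm_store_si128((__m128i *)dst, v);
    while (len--) *dst++ = val;                  /* scalar tail */
}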
PRBool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf, const gfxImageSurface* whiteSurf) { gfxIntSize size = blackSurf->GetSize(); if (size != whiteSurf->GetSize() || (blackSurf->Format() != gfxASurface::ImageFormatARGB32 && blackSurf->Format() != gfxASurface::ImageFormatRGB24) || (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 && whiteSurf->Format() != gfxASurface::ImageFormatRGB24)) return PR_FALSE; blackSurf->Flush(); whiteSurf->Flush(); unsigned char* blackData = blackSurf->Data(); unsigned char* whiteData = whiteSurf->Data(); if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) || (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) { // Cannot keep these in alignment. return PR_FALSE; } __m128i greenMask = _mm_load_si128((__m128i*)greenMaski); __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski); for (PRInt32 i = 0; i < size.height; ++i) { PRInt32 j = 0; // Loop single pixels until at 4 byte alignment. while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) { *((PRUint32*)blackData) = RecoverPixel(*reinterpret_cast<PRUint32*>(blackData), *reinterpret_cast<PRUint32*>(whiteData)); blackData += 4; whiteData += 4; j++; } // This extra loop allows the compiler to do some more clever registry // management and makes it about 5% faster than with only the 4 pixel // at a time loop. for (; j < size.width - 8; j += 8) { __m128i black1 = _mm_load_si128((__m128i*)blackData); __m128i white1 = _mm_load_si128((__m128i*)whiteData); __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16)); __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16)); // Execute the same instructions as described in RecoverPixel, only // using an SSE2 packed saturated subtract. white1 = _mm_subs_epu8(white1, black1); white2 = _mm_subs_epu8(white2, black2); white1 = _mm_subs_epu8(greenMask, white1); white2 = _mm_subs_epu8(greenMask, white2); // Producing the final black pixel in an XMM register and storing // that is actually faster than doing a masked store since that // does an unaligned storage. We have the black pixel in a register // anyway. black1 = _mm_andnot_si128(alphaMask, black1); black2 = _mm_andnot_si128(alphaMask, black2); white1 = _mm_slli_si128(white1, 2); white2 = _mm_slli_si128(white2, 2); white1 = _mm_and_si128(alphaMask, white1); white2 = _mm_and_si128(alphaMask, white2); black1 = _mm_or_si128(white1, black1); black2 = _mm_or_si128(white2, black2); _mm_store_si128((__m128i*)blackData, black1); _mm_store_si128((__m128i*)(blackData + 16), black2); blackData += 32; whiteData += 32; } for (; j < size.width - 4; j += 4) { __m128i black = _mm_load_si128((__m128i*)blackData); __m128i white = _mm_load_si128((__m128i*)whiteData); white = _mm_subs_epu8(white, black); white = _mm_subs_epu8(greenMask, white); black = _mm_andnot_si128(alphaMask, black); white = _mm_slli_si128(white, 2); white = _mm_and_si128(alphaMask, white); black = _mm_or_si128(white, black); _mm_store_si128((__m128i*)blackData, black); blackData += 16; whiteData += 16; } // Loop single pixels until we're done. while (j < size.width) { *((PRUint32*)blackData) = RecoverPixel(*reinterpret_cast<PRUint32*>(blackData), *reinterpret_cast<PRUint32*>(whiteData)); blackData += 4; whiteData += 4; j++; } blackData += blackSurf->Stride() - j * 4; whiteData += whiteSurf->Stride() - j * 4; } blackSurf->MarkDirty(); return PR_TRUE; }
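/* Scalar mirror of the SSE2 steps above (and of the RecoverPixel() helper they fall
 * back to): for content rendered once on black and once on white, the recovered alpha
 * is 255 - (white_green - black_green) and the colour channels are taken from the
 * black rendering. Sketch assuming 0xAARRGGBB pixel packing and that greenMaski /
 * alphaMaski select the green and alpha bytes, as the shifts in the code suggest. */
static PRUint32 RecoverPixelScalar(PRUint32 black, PRUint32 white)
{
    PRUint32 blackG = (black >> 8) & 0xff;
    PRUint32 whiteG = (white >> 8) & 0xff;
    /* _mm_subs_epu8 saturates at 0, so mirror that here. */
    PRUint32 diff  = (whiteG >= blackG) ? (whiteG - blackG) : 0;
    PRUint32 alpha = 255 - diff;
    return (alpha << 24) | (black & 0x00ffffff);
}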
static pstatus_t sse2_set_8u( BYTE val, BYTE* pDst, UINT32 len) { BYTE byte, *dptr; __m128i xmm0; size_t count; if (len < 16) return generic->set_8u(val, pDst, len); byte = val; dptr = (BYTE*) pDst; /* Seek 16-byte alignment. */ while ((ULONG_PTR) dptr & 0x0f) { *dptr++ = byte; if (--len == 0) return PRIMITIVES_SUCCESS; } xmm0 = _mm_set1_epi8(byte); /* Cover 256-byte chunks via SSE register stores. */ count = len >> 8; len -= count << 8; /* Do 256-byte chunks using one XMM register. */ while (count--) { _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; } /* Cover 16-byte chunks via SSE register stores. */ count = len >> 4; len -= count << 4; /* Do 16-byte chunks using one XMM register. */ while (count--) { _mm_store_si128((__m128i*) dptr, xmm0); dptr += 16; } /* Do leftover bytes. */ while (len--) *dptr++ = byte; return PRIMITIVES_SUCCESS; }
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); // duplicate only the first 16 bits (first and second byte) // across 256 bit register firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i-=2) { // load the 2 strides of source srcReg32b1 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr - 3))); srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, _mm_loadu_si128((const __m128i *) (src_ptr+src_pixels_per_line-3)), 1); // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + 5))); srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, _mm_loadu_si128((const __m128i *) 
(src_ptr+src_pixels_per_line+5)), 1); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr+=src_stride; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, _mm256_castsi256_si128(srcRegFilt32b1_1)); // save the next 16 bits _mm_store_si128((__m128i*)(output_ptr+output_pitch), _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); output_ptr+=dst_stride; } // if the number of strides is odd. 
// process only 16 bytes if (i > 0) { __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); } }
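/* Scalar reference for the 16-wide 8-tap filter above (hypothetical name, written
 * from the code rather than taken from libvpx): each output pixel is a 7-bit
 * fixed-point convolution over taps x-3 .. x+4, rounded with the 64 the AVX2 code
 * adds via addFilterReg64, shifted by 7 and clamped to 8 bits. The AVX2 version
 * computes the same thing two rows at a time, ordering its saturating adds through
 * the min/max pair so intermediate sums do not clip prematurely. */
#include <stdint.h>
#include <stddef.h>

static void filter_block1d16_h8_scalar(const uint8_t *src_ptr, ptrdiff_t src_stride,
                                       uint8_t *output_ptr, ptrdiff_t out_stride,
                                       uint32_t output_height, const int16_t *filter)
{
    for (uint32_t y = 0; y < output_height; y++) {
        for (int x = 0; x < 16; x++) {
            int sum = 64;                          /* rounding term */
            for (int k = 0; k < 8; k++)
                sum += src_ptr[x - 3 + k] * filter[k];
            sum >>= 7;                             /* filter precision */
            output_ptr[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
        src_ptr += src_stride;
        output_ptr += out_stride;
    }
}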
_declspec(dllexport) DiffResult __stdcall diff_img(Image left, Image right, DiffOptions options) { if (options.ignoreColor) { makeGreyscale(left); makeGreyscale(right); } float* imgMem = (float*)_aligned_malloc(left.width * left.height * sizeof(float) * 4, 16); int colorOffset = left.width * left.height; Image diff = { left.width, left.height, left.stride, imgMem, imgMem + colorOffset, imgMem + colorOffset * 2, imgMem + colorOffset * 3 }; float* drp = diff.r; float* dgp = diff.g; float* dbp = diff.b; float* dap = diff.a; float* lrp = left.r; float* lgp = left.g; float* lbp = left.b; float* lap = left.a; float* rrp = right.r; float* rgp = right.g; float* rbp = right.b; float* rap = right.a; Color error = ConvertToFloat(options.errorColor); auto er = _mm_set_ps1(error.r); auto eg = _mm_set_ps1(error.g); auto eb = _mm_set_ps1(error.b); auto ea = _mm_set_ps1(error.a); auto tolerance = _mm_set_ps1(options.tolerance); auto overlayTransparency = _mm_set_ps1(options.overlayTransparency); OverlayType overlayType = options.overlayType; byte weightByDiffPercentage = options.weightByDiffPercentage; auto diffPixelCount = _mm_set_epi32(0, 0, 0, 0); auto onei = _mm_set1_epi32(1); auto one = _mm_set1_ps(1); auto zero = _mm_set1_ps(0); for (int y = 0; y < left.height; y++) { for (int x = 0; x < left.width; x+=4) { auto lr = _mm_load_ps(lrp); auto lg = _mm_load_ps(lgp); auto lb = _mm_load_ps(lbp); auto la = _mm_load_ps(lap); auto rr = _mm_load_ps(rrp); auto rg = _mm_load_ps(rgp); auto rb = _mm_load_ps(rbp); auto ra = _mm_load_ps(rap); auto rdiff = _mm_sub_ps(rr, lr); auto gdiff = _mm_sub_ps(rg, lg); auto bdiff = _mm_sub_ps(rb, lb); auto adiff = _mm_sub_ps(ra, la); auto distance = _mm_mul_ps(rdiff, rdiff); distance = _mm_add_ps(distance, _mm_mul_ps(gdiff, gdiff)); distance = _mm_add_ps(distance, _mm_mul_ps(bdiff, bdiff)); distance = _mm_add_ps(distance, _mm_mul_ps(adiff, adiff)); distance = _mm_sqrt_ps(distance); auto t = overlayTransparency; if (weightByDiffPercentage) { t = _mm_mul_ps(t, distance); } auto isdiff = _mm_cmpgt_ps(distance, tolerance); t = _mm_min_ps(one, _mm_max_ps(zero, t)); auto mlr = rr; auto mlg = rg; auto mlb = rb; auto mla = ra; if (overlayType == OverlayType::Movement) { mlr = _mm_mul_ps(mlr, er); mlg = _mm_mul_ps(mlg, eg); mlb = _mm_mul_ps(mlb, eb); mla = _mm_mul_ps(mla, ea); } auto oneMinusT = _mm_sub_ps(one, t); auto mixedR = _mm_add_ps(_mm_mul_ps(mlr, oneMinusT), _mm_mul_ps(er, t)); auto mixedG = _mm_add_ps(_mm_mul_ps(mlg, oneMinusT), _mm_mul_ps(eg, t)); auto mixedB = _mm_add_ps(_mm_mul_ps(mlb, oneMinusT), _mm_mul_ps(eb, t)); auto mixedA = one; if (overlayType != OverlayType::Movement) { mixedA = _mm_add_ps(_mm_mul_ps(mla, oneMinusT), _mm_mul_ps(ea, t)); } // (((b ^ a) & mask)^a) auto dr = _mm_xor_ps(lr, _mm_and_ps(isdiff, _mm_xor_ps(mixedR, lr))); auto dg = _mm_xor_ps(lg, _mm_and_ps(isdiff, _mm_xor_ps(mixedG, lg))); auto db = _mm_xor_ps(lb, _mm_and_ps(isdiff, _mm_xor_ps(mixedB, lb))); auto da = _mm_xor_ps(la, _mm_and_ps(isdiff, _mm_xor_ps(mixedA, la))); diffPixelCount = _mm_xor_si128(diffPixelCount, _mm_and_si128(_mm_castps_si128(isdiff), _mm_xor_si128(_mm_add_epi32(diffPixelCount, onei), diffPixelCount))); _mm_store_ps(drp, dr); _mm_store_ps(dgp, dg); _mm_store_ps(dbp, db); _mm_store_ps(dap, da); drp+=4; dgp+=4; dbp+=4; dap+=4; lrp+=4; lgp+=4; lbp+=4; lap+=4; rrp+=4; rgp+=4; rbp+=4; rap+=4; } } int* pixelCounts = (int*)_aligned_malloc(4 * sizeof(int), 16); _mm_store_si128((__m128i*)pixelCounts, diffPixelCount); int totalCount = pixelCounts[0] + pixelCounts[1] + pixelCounts[2] + 
pixelCounts[3]; _aligned_free(pixelCounts); return{ diff, 1.0f - float(totalCount) / (left.height * left.width - left.height * left.stride) }; }
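/* Per-pixel scalar sketch of the SSE loop in diff_img(): pixels are compared by
 * Euclidean distance across RGBA, and pixels over the tolerance are blended toward
 * the error colour; the ((b ^ a) & mask) ^ a pattern in the SIMD code is just a
 * branchless select, and the xor/add on diffPixelCount is a branchless conditional
 * increment. The Color struct is reused from the snippet for a single pixel; the
 * Movement overlay tint is omitted for brevity, and this is not the shipped code. */
#include <math.h>

static void diff_pixel(Color l, Color r, Color error,
                       float tolerance, float overlayTransparency,
                       int weightByDiffPercentage, Color *out, int *diffCount)
{
    float dr = r.r - l.r, dg = r.g - l.g, db = r.b - l.b, da = r.a - l.a;
    float distance = sqrtf(dr * dr + dg * dg + db * db + da * da);
    float t = overlayTransparency;
    if (weightByDiffPercentage)
        t *= distance;                               /* stronger overlay for bigger differences */
    if (t < 0.0f) t = 0.0f; else if (t > 1.0f) t = 1.0f;
    if (distance > tolerance) {
        out->r = r.r * (1.0f - t) + error.r * t;     /* blend toward the error colour */
        out->g = r.g * (1.0f - t) + error.g * t;
        out->b = r.b * (1.0f - t) + error.b * t;
        out->a = r.a * (1.0f - t) + error.a * t;
        ++*diffCount;
    } else {
        *out = l;                                    /* close enough: keep the left pixel */
    }
}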
void fb_slvn_low(dig_t *c, const dig_t *a) { int i; dig_t *p, u0, u1, u2, u3; void *tab = fb_poly_get_slv(); __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm; perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000); mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100); sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400); t0 = _mm_load_si128((__m128i *)a); t1 = _mm_load_si128((__m128i *)(a + 2)); r0 = r1 = _mm_setzero_si128(); m0 = _mm_shuffle_epi8(t1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_and_si128(m1, mask2); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m0 = _mm_and_si128(t0, mask2); m0 = _mm_shuffle_epi8(m0, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_srli_si128(m1, 8); m1 = _mm_andnot_si128(mask2, m1); m2 = _mm_slli_epi64(m2, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 6); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 7); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 1); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F)); m1 = _mm_slli_epi64(m1, 4); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3)); m1 = _mm_slli_epi64(m1, 2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1)); m1 = _mm_slli_epi64(m1, 1); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000); sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000); m1 = _mm_and_si128(t0, mask0); m2 = _mm_and_si128(t0, mask1); m3 = _mm_and_si128(t1, mask0); m4 = _mm_and_si128(t1, mask1); m2 = 
_mm_srli_epi64(m2, 4); m4 = _mm_srli_epi64(m4, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m4 = _mm_shuffle_epi8(sqrt1, m4); m3 = _mm_shuffle_epi8(sqrt0, m3); m1 = _mm_or_si128(m1, m2); m3 = _mm_or_si128(m3, m4); #ifndef __PCLMUL__ align dig_t x[2]; _mm_store_si128((__m128i *)x, m1); u0 = x[0]; u1 = x[1]; _mm_store_si128((__m128i *)x, m3); u2 = x[0]; u3 = x[1]; #else u0 = _mm_extract_epi64(m1, 0); u1 = _mm_extract_epi64(m1, 1); u2 = _mm_extract_epi64(m3, 0); u3 = _mm_extract_epi64(m3, 1); #endif for (i = 0; i < 8; i++) { p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u0 >>= 8; p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u1 >>= 8; p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u2 >>= 8; p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u3 >>= 8; } _mm_store_si128((__m128i *)c, r0); _mm_store_si128((__m128i *)(c + 2), r1); }
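/* The core trick used throughout fb_slvn_low(): _mm_shuffle_epi8 doubles as a
 * 16-entry, byte-wide table lookup, so a per-nibble map (here the sqrt0/sqrt1 tables)
 * is applied to 16 bytes at once by splitting each byte into its low and high nibble
 * and combining the two lookups with XOR (addition in GF(2)). Minimal, self-contained
 * illustration of that idiom; the tables are caller-supplied here. */
#include <tmmintrin.h>   /* SSSE3 */

static __m128i bytewise_nibble_lookup(__m128i x, __m128i table_lo, __m128i table_hi)
{
    const __m128i nib = _mm_set1_epi8(0x0F);
    __m128i lo = _mm_and_si128(x, nib);                     /* low nibble of each byte  */
    __m128i hi = _mm_and_si128(_mm_srli_epi64(x, 4), nib);  /* high nibble of each byte */
    return _mm_xor_si128(_mm_shuffle_epi8(table_lo, lo),
                         _mm_shuffle_epi8(table_hi, hi));
}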
void BM3D_Basic_Process::CollaborativeFilter(int plane, FLType *ResNum, FLType *ResDen, const FLType *src, const FLType *ref, const PosPairCode &code) const { PCType GroupSize = static_cast<PCType>(code.size()); // When para.GroupSize > 0, limit GroupSize up to para.GroupSize if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize) { GroupSize = d.para.GroupSize; } // Construct source group guided by matched pos code block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize); // Initialize retianed coefficients of hard threshold filtering int retainedCoefs = 0; // Apply forward 3D transform to the source group d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data()); // Apply hard-thresholding to the source group auto srcp = srcGroup.data(); auto thrp = d.f[plane].thrTable[GroupSize - 1].get(); const auto upper = srcp + srcGroup.size(); #if defined(__SSE2__) static const ptrdiff_t simd_step = 4; const ptrdiff_t simd_residue = srcGroup.size() % simd_step; const ptrdiff_t simd_width = srcGroup.size() - simd_residue; static const __m128 zero_ps = _mm_setzero_ps(); __m128i cmp_sum = _mm_setzero_si128(); for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step) { const __m128 s1 = _mm_load_ps(srcp); const __m128 t1p = _mm_load_ps(thrp); const __m128 t1n = _mm_sub_ps(zero_ps, t1p); const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p); const __m128 cmp2 = _mm_cmplt_ps(s1, t1n); const __m128 cmp = _mm_or_ps(cmp1, cmp2); const __m128 d1 = _mm_and_ps(cmp, s1); _mm_store_ps(srcp, d1); cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp)); } alignas(16) int32_t cmp_sum_i32[4]; _mm_store_si128(reinterpret_cast<__m128i *>(cmp_sum_i32), cmp_sum); retainedCoefs += cmp_sum_i32[0] + cmp_sum_i32[1] + cmp_sum_i32[2] + cmp_sum_i32[3]; #endif for (; srcp < upper; ++srcp, ++thrp) { if (*srcp > *thrp || *srcp < -*thrp) { ++retainedCoefs; } else { *srcp = 0; } } // Apply backward 3D transform to the filtered group d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data()); // Calculate weight for the filtered group // Also include the normalization factor to compensate for the amplification introduced in 3D transform FLType denWeight = retainedCoefs < 1 ? 1 : FLType(1) / static_cast<FLType>(retainedCoefs); FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]); // Store the weighted filtered group to the numerator part of the basic estimation // Store the weight to the denominator part of the basic estimation srcGroup.AddTo(ResNum, dst_stride[plane], numWeight); srcGroup.CountTo(ResDen, dst_stride[plane], denWeight); }
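/* Isolated illustration of the mask-counting idiom used in the SSE2 block of
 * CollaborativeFilter(): _mm_cmpgt_ps yields 0 or all-ones (-1 as an integer) per
 * lane, so subtracting the reinterpreted mask from an integer accumulator increments
 * the per-lane counts, and a final horizontal add gives the number of elements above
 * the threshold. Standalone sketch; n is assumed to be a multiple of 4. */
#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

static int count_above_threshold(const float *data, size_t n, float thr)
{
    __m128i counts = _mm_setzero_si128();
    const __m128 vthr = _mm_set1_ps(thr);
    for (size_t i = 0; i < n; i += 4) {
        __m128 cmp = _mm_cmpgt_ps(_mm_loadu_ps(&data[i]), vthr); /* -1 where data > thr */
        counts = _mm_sub_epi32(counts, _mm_castps_si128(cmp));   /* count -= (-1)       */
    }
    int32_t c[4];
    _mm_storeu_si128((__m128i *)c, counts);
    return c[0] + c[1] + c[2] + c[3];
}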
static void rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width) { int y, n; INT16* l_ptr = l; INT16* h_ptr = h; INT16* dst_ptr = dst; int first; int last; __m128i l_n; __m128i h_n; __m128i h_n_m; __m128i tmp_n; __m128i dst_n; __m128i dst_n_p; __m128i dst1; __m128i dst2; for (y = 0; y < subband_width; y++) { /* Even coefficients */ for (n = 0; n < subband_width; n += 8) { /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ l_n = _mm_load_si128((__m128i*) l_ptr); h_n = _mm_load_si128((__m128i*) h_ptr); h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1)); if (n == 0) { first = _mm_extract_epi16(h_n_m, 1); h_n_m = _mm_insert_epi16(h_n_m, first, 0); } tmp_n = _mm_add_epi16(h_n, h_n_m); tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1)); tmp_n = _mm_srai_epi16(tmp_n, 1); dst_n = _mm_sub_epi16(l_n, tmp_n); _mm_store_si128((__m128i*) l_ptr, dst_n); l_ptr += 8; h_ptr += 8; } l_ptr -= subband_width; h_ptr -= subband_width; /* Odd coefficients */ for (n = 0; n < subband_width; n += 8) { /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ h_n = _mm_load_si128((__m128i*) h_ptr); h_n = _mm_slli_epi16(h_n, 1); dst_n = _mm_load_si128((__m128i*) (l_ptr)); dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1)); if (n == subband_width - 8) { last = _mm_extract_epi16(dst_n_p, 6); dst_n_p = _mm_insert_epi16(dst_n_p, last, 7); } tmp_n = _mm_add_epi16(dst_n_p, dst_n); tmp_n = _mm_srai_epi16(tmp_n, 1); tmp_n = _mm_add_epi16(tmp_n, h_n); dst1 = _mm_unpacklo_epi16(dst_n, tmp_n); dst2 = _mm_unpackhi_epi16(dst_n, tmp_n); _mm_store_si128((__m128i*) dst_ptr, dst1); _mm_store_si128((__m128i*) (dst_ptr + 8), dst2); l_ptr += 8; h_ptr += 8; dst_ptr += 16; } } }
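/* Scalar version of the lifting steps spelled out in the comments above, including
 * the boundary replication the SSE2 code patches in with extract/insert (h[-1] := h[0]
 * and even[subband_width] := even[subband_width - 1]). One row only, for clarity; the
 * SSE2 routine also writes the even results back into l[] as scratch storage. */
static void dwt_decode_row_horiz_scalar(const INT16 *l, const INT16 *h, INT16 *dst,
                                        int subband_width)
{
    for (int n = 0; n < subband_width; n++) {
        INT16 h_prev = h[n > 0 ? n - 1 : 0];
        dst[2 * n] = (INT16)(l[n] - ((h_prev + h[n] + 1) >> 1));        /* even samples */
    }
    for (int n = 0; n < subband_width; n++) {
        INT16 even_next = dst[n < subband_width - 1 ? 2 * n + 2 : 2 * n];
        dst[2 * n + 1] = (INT16)((h[n] << 1) +
                                 ((dst[2 * n] + even_next) >> 1));      /* odd samples  */
    }
}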
EvalSum& operator = (const EvalSum& rhs) { _mm_store_si128(&m[0], rhs.m[0]); _mm_store_si128(&m[1], rhs.m[1]); return *this; }