int oneThread(int threadId)
{
  int *aa;
  int *bb;
  int k;
  int itr;

  aa = (int *)_mm_malloc(sizeof(int) * ARRAY_SIZE, 16);
  bb = (int *)_mm_malloc(sizeof(int) * ARRAY_SIZE, 16);
  /* NB: memset fills bytes, so each element becomes 0x01010101 / 0x02020202,
   * not 1 / 2. */
  memset(&aa[0], 1, ARRAY_SIZE * 4);
  memset(&bb[0], 2, ARRAY_SIZE * 4);

  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
  __m128i a4, a5, a6, a7, b4, b5, b6, b7;
  __m128i c0, c1, c2, c3;
  __m128i c4, c5, c6, c7;
  __m128i cc;

  cc = _mm_setzero_si128();

  for (k = 0; k < REPS; k++) {
    for (itr = 0; itr < ARRAY_SIZE; itr += 32) {
      a0 = _mm_load_si128((__m128i *)&aa[itr]);
      a1 = _mm_load_si128((__m128i *)&aa[itr + 4]);
      a2 = _mm_load_si128((__m128i *)&aa[itr + 8]);
      a3 = _mm_load_si128((__m128i *)&aa[itr + 12]);
      a4 = _mm_load_si128((__m128i *)&aa[itr + 16]);
      a5 = _mm_load_si128((__m128i *)&aa[itr + 20]);
      a6 = _mm_load_si128((__m128i *)&aa[itr + 24]);
      a7 = _mm_load_si128((__m128i *)&aa[itr + 28]);
      b0 = _mm_load_si128((__m128i *)&bb[itr]);
      b1 = _mm_load_si128((__m128i *)&bb[itr + 4]);
      b2 = _mm_load_si128((__m128i *)&bb[itr + 8]);
      b3 = _mm_load_si128((__m128i *)&bb[itr + 12]);
      b4 = _mm_load_si128((__m128i *)&bb[itr + 16]);
      b5 = _mm_load_si128((__m128i *)&bb[itr + 20]);
      b6 = _mm_load_si128((__m128i *)&bb[itr + 24]);
      b7 = _mm_load_si128((__m128i *)&bb[itr + 28]);
      /* _mm_mul_epi32 multiplies only lanes 0 and 2, yielding two 64-bit
       * products per register. */
      c0 = _mm_mul_epi32(a0, b0);
      c1 = _mm_mul_epi32(a1, b1);
      c2 = _mm_mul_epi32(a2, b2);
      c3 = _mm_mul_epi32(a3, b3);
      c4 = _mm_mul_epi32(a4, b4);
      c5 = _mm_mul_epi32(a5, b5);
      c6 = _mm_mul_epi32(a6, b6);
      c7 = _mm_mul_epi32(a7, b7);
      c0 = _mm_add_epi32(c0, c1);
      c1 = _mm_add_epi32(c2, c3);
      c2 = _mm_add_epi32(c4, c5);
      c3 = _mm_add_epi32(c6, c7);
      c0 = _mm_add_epi32(c0, c1);
      c1 = _mm_add_epi32(c2, c3);
      c0 = _mm_add_epi32(c0, c1);
      cc = _mm_add_epi32(cc, c0);
    }
  }

  /* Horizontal reduction of the four 32-bit partial sums. */
  cc = _mm_hadd_epi32(cc, cc);
  cc = _mm_hadd_epi32(cc, cc);
  int count = 0;
  count = _mm_cvtsi128_si32(cc);

  /* Memory from _mm_malloc must be released with _mm_free, not free(). */
  _mm_free(aa);
  _mm_free(bb);
  return count;
}
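/*
 * Editor's aside (a minimal sketch, not part of the original source): the
 * kernel above uses _mm_mul_epi32, which multiplies only lanes 0 and 2 of
 * each register and produces two 64-bit products, yet the results are then
 * summed with 32-bit adds. If a plain elementwise 32-bit product is the
 * intent, _mm_mullo_epi32 is the usual choice. Both require SSE4.1.
 */
#include <smmintrin.h>  /* SSE4.1 */

static inline __m128i mul4_lo(__m128i a, __m128i b)
{
    return _mm_mullo_epi32(a, b);  /* four 32-bit products, low halves kept */
}

static inline __m128i mul2_wide(__m128i a, __m128i b)
{
    return _mm_mul_epi32(a, b);    /* two 64-bit products from lanes 0 and 2 */
}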
PRBool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
         blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
        (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
         whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return PR_FALSE;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return PR_FALSE;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (PRInt32 i = 0; i < size.height; ++i) {
        PRInt32 j = 0;
        // Loop single pixels until at 16-byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the
        // 4-pixels-at-a-time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store, since that
            // does an unaligned store. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);
            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();
    return PR_TRUE;
}
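/*
 * Editor's sketch (assumption, not from the original source): a scalar model
 * of what the SSE2 loops above compute per pixel, assuming RecoverPixel
 * recovers alpha as 255 - (whiteGreen - blackGreen), greenMaski holds 0xff in
 * each green byte, and the pixel layout is BGRA in a 32-bit word.
 */
static PRUint32 RecoverPixelScalarSketch(PRUint32 black, PRUint32 white)
{
    PRUint32 blackGreen = (black >> 8) & 0xff;
    PRUint32 whiteGreen = (white >> 8) & 0xff;
    // Saturated subtract, as _mm_subs_epu8 does per byte.
    PRUint32 diff  = whiteGreen > blackGreen ? whiteGreen - blackGreen : 0;
    PRUint32 alpha = 255 - diff;            // diff <= 255, so no clamp needed
    return (black & 0x00ffffff) | (alpha << 24);
}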
static void
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
    int y, n;
    INT16* l_ptr = l;
    INT16* h_ptr = h;
    INT16* dst_ptr = dst;
    int first;
    int last;
    __m128i l_n;
    __m128i h_n;
    __m128i h_n_m;
    __m128i tmp_n;
    __m128i dst_n;
    __m128i dst_n_p;
    __m128i dst1;
    __m128i dst2;

    for (y = 0; y < subband_width; y++)
    {
        /* Even coefficients */
        for (n = 0; n < subband_width; n += 8)
        {
            /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
            l_n = _mm_load_si128((__m128i*) l_ptr);
            h_n = _mm_load_si128((__m128i*) h_ptr);
            h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
            if (n == 0)
            {
                /* Mirror the boundary: substitute h[0] for h[-1]. */
                first = _mm_extract_epi16(h_n_m, 1);
                h_n_m = _mm_insert_epi16(h_n_m, first, 0);
            }
            tmp_n = _mm_add_epi16(h_n, h_n_m);
            tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
            tmp_n = _mm_srai_epi16(tmp_n, 1);
            dst_n = _mm_sub_epi16(l_n, tmp_n);
            _mm_store_si128((__m128i*) l_ptr, dst_n);
            l_ptr += 8;
            h_ptr += 8;
        }
        l_ptr -= subband_width;
        h_ptr -= subband_width;

        /* Odd coefficients */
        for (n = 0; n < subband_width; n += 8)
        {
            /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
            h_n = _mm_load_si128((__m128i*) h_ptr);
            h_n = _mm_slli_epi16(h_n, 1);
            dst_n = _mm_load_si128((__m128i*) (l_ptr));
            dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
            if (n == subband_width - 8)
            {
                /* Mirror the boundary at the other end. */
                last = _mm_extract_epi16(dst_n_p, 6);
                dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
            }
            tmp_n = _mm_add_epi16(dst_n_p, dst_n);
            tmp_n = _mm_srai_epi16(tmp_n, 1);
            tmp_n = _mm_add_epi16(tmp_n, h_n);
            dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
            dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
            _mm_store_si128((__m128i*) dst_ptr, dst1);
            _mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
            l_ptr += 8;
            h_ptr += 8;
            dst_ptr += 16;
        }
    }
}
mlib_status
mlib_VideoColorBGR2JFIFYCC444_S16_aligned(
    mlib_s16 *y,
    mlib_s16 *cb,
    mlib_s16 *cr,
    const mlib_s16 *bgr,
    mlib_s32 n)
{
	/* 0.299*32768 */
	const __m128i x_c11 = _mm_set1_epi16(9798);
	/* 0.587*32768 */
	const __m128i x_c12 = _mm_set1_epi16(19235);
	/* 0.114*32768 */
	const __m128i x_c13 = _mm_set1_epi16(3735);
	/* -0.16874*32768 */
	const __m128i x_c21 = _mm_set1_epi16(-5529);
	/* -0.33126*32768 */
	const __m128i x_c22 = _mm_set1_epi16(-10855);
	/* 0.5*32768 */
	const __m128i x_c23 = _mm_set1_epi16(16384);
	/* 0.5*32768 */
	const __m128i x_c31 = x_c23;
	/* -0.41869*32768 */
	const __m128i x_c32 = _mm_set1_epi16(-13720);
	/* -0.08131*32768 */
	const __m128i x_c33 = _mm_set1_epi16(-2664);
	/* 2048, pre-scaled by 2 bits to match the intermediate format */
	const __m128i x_coff = _mm_set1_epi16(2048 << 2);
	const __m128i x_zero = _mm_setzero_si128();

	__m128i x_bgr0, x_bgr1, x_bgr2, x_r, x_g, x_b;
	__m128i x_y, x_cb, x_cr;
	__m128i x_t0, x_t1, x_t2, x_t3, x_t4, x_t5;
	__m128i *px_y, *px_cb, *px_cr, *px_bgr;
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	px_bgr = (__m128i *)bgr;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (i = 0; i <= (n - 8); i += 8) {
		x_bgr0 = _mm_load_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_load_si128(px_bgr++);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		x_bgr2 = _mm_load_si128(px_bgr++);
		x_bgr2 = _mm_slli_epi16(x_bgr2, 3);
		SeparateBGR48_S16;

		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);

		_mm_store_si128(px_y++, x_y);
		_mm_store_si128(px_cb++, x_cb);
		_mm_store_si128(px_cr++, x_cr);
	}

	if (i <= (n - 4)) {
		x_bgr0 = _mm_load_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_loadl_epi64(px_bgr);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		px_bgr = (__m128i *)((__m64 *)px_bgr + 1);
		SeparateBGR24_S16;

		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);

		_mm_storel_epi64(px_y, x_y);
		px_y = (__m128i *)((__m64 *)px_y + 1);
		_mm_storel_epi64(px_cb, x_cb);
		px_cb = (__m128i *)((__m64 *)px_cb + 1);
		_mm_storel_epi64(px_cr, x_cr);
		px_cr = (__m128i *)((__m64 *)px_cr + 1);
		i += 4;
	}

	for (; i <= (n - 1); i++) {
		fb = bgr[3 * i];
		fg = bgr[3 * i + 1];
		fr = bgr[3 * i + 2];
		fy = 0.29900f * fr + 0.58700f * fg + 0.11400f * fb;
		fcb = -0.16874f * fr - 0.33126f * fg + 0.50000f * fb + 2048;
		fcr = 0.50000f * fr - 0.41869f * fg - 0.08131f * fb + 2048;
		y[i] = (mlib_s16)fy;
		cb[i] = (mlib_s16)fcb;
		cr[i] = (mlib_s16)fcr;
	}

	return (MLIB_SUCCESS);
}
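/*
 * Editor's aside (illustration only, not from the original source): the
 * kernel above does fixed-point arithmetic with _mm_mulhi_epi16. Each
 * coefficient f is stored as round(f * 32768); _mm_mulhi_epi16(x, c)
 * computes (x * c) >> 16 per lane, i.e. roughly x * f / 2. The inputs are
 * pre-shifted left by 3 and the sums shifted right by 2, so the net scale
 * works out to x * f. Scalar model of one lane:
 */
#include <stdint.h>

static inline int16_t mulhi16(int16_t x, int16_t c)
{
    return (int16_t)(((int32_t)x * c) >> 16);  /* what _mm_mulhi_epi16 does */
}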
int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen)
{
	hashState state;
	u_int32_t *data32, *data32_end;
	u_int64_t *data64;
	unsigned char *lastPartP, *data8_end;
#ifdef __x86_64__
	u_int64_t i, iterations, counter, databyteLength;
#else
	int i, iterations, counter, databyteLength;
#endif

	// This might be a static check
	if (crypto_hash_BYTES != 32)
		return -1;

	// Want it to be the native data size, and not bigger.
	databyteLength = inlen;

#ifdef __SSE__
	// Use SSE here, if it is available.
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[0],
	                _mm_load_si128((__m128i *) &i256p2[0]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[4],
	                _mm_load_si128((__m128i *) &i256p2[4]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[8],
	                _mm_load_si128((__m128i *) &i256p2[8]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[12],
	                _mm_load_si128((__m128i *) &i256p2[12]));
#elif defined(__x86_64__)
	// Otherwise use 64-bit writes on a 64-bit system.
	// NB: as in the original, only the even indices are assigned here,
	// which appears to assume these arrays are addressed as 64-bit words.
	hashState256_(state).DoublePipe[0] = i256p2[0];
	hashState256_(state).DoublePipe[2] = i256p2[2];
	hashState256_(state).DoublePipe[4] = i256p2[4];
	hashState256_(state).DoublePipe[6] = i256p2[6];
	hashState256_(state).DoublePipe[8] = i256p2[8];
	hashState256_(state).DoublePipe[10] = i256p2[10];
	hashState256_(state).DoublePipe[12] = i256p2[12];
	hashState256_(state).DoublePipe[14] = i256p2[14];
#else
	// Fallback
	memcpy(hashState256_(state).DoublePipe, i256p2, 16 * sizeof(u_int32_t));
#endif

	data32 = (u_int32_t *) in;
	iterations = databyteLength / BlueMidnightWish256_BLOCK_SIZE;
	data32_end = data32 + iterations * 16;
	if (iterations > 0)
		Compress256(data32, data32_end, &state);
	databyteLength -= BlueMidnightWish256_BLOCK_SIZE * iterations;

	data64 = (u_int64_t *)hashState256_(state).LastPart;
	if (databyteLength < 56) {
#ifdef __SSE__
		// Use SSE here, if it is available.
		__m128i zero = _mm_setzero_si128();
		_mm_store_si128((__m128i *) &data64[0], zero);
		_mm_store_si128((__m128i *) &data64[2], zero);
		_mm_store_si128((__m128i *) &data64[4], zero);
		_mm_store_si128((__m128i *) &data64[6], zero);
#elif defined(__x86_64__)
		// Otherwise use 64-bit writes on a 64-bit system.
		data64[0] = 0; data64[1] = 0;
		data64[2] = 0; data64[3] = 0;
		data64[4] = 0; data64[5] = 0;
		data64[6] = 0; data64[7] = 0;
#else
		// Fallback
		memset(data64 + (databyteLength >> 4), 0x00,
		       BlueMidnightWish256_BLOCK_SIZE - ((databyteLength >> 4) << 3));
#endif
	}
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits)
 * into 3 buffers ready to be used by compute_gamma()
 */
void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb)
{
  uint32_t i;
  __m128i *inputPtr = (__m128i*) input;
  __m128i in0, in1, in2;
  __m128i s0, s1, s2, s;
  __m128i p00, p01, p02, p0;
  __m128i p10, p11, p12, p1;

  __m128i *sysPtr = (__m128i*) h->syst;
  __m128i *pa0Ptr = (__m128i*) h->parity0;
  __m128i *pa1Ptr = (__m128i*) h->parity1;

  // pick bits 0, 3, 6 from 1st word
  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
  // pick bits 1, 4, 7 from 2nd word
  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 2, 5 from 3rd word
  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // pick bits 1, 4, 7 from 1st word
  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
  // pick bits 2, 5 from 2nd word
  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 0, 3, 6 from 3rd word
  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // pick bits 2, 5 from 1st word
  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
  // pick bits 0, 3, 6 from 2nd word
  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
  // pick bits 1, 4, 7 from 3rd word
  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // Split systematic and parity bits
  for (i = 0; i < long_cb/8; i++) {
    in0 = _mm_load_si128(inputPtr); inputPtr++;
    in1 = _mm_load_si128(inputPtr); inputPtr++;
    in2 = _mm_load_si128(inputPtr); inputPtr++;

    /* Deinterleave systematic bits */
    s0 = _mm_shuffle_epi8(in0, s0_mask);
    s1 = _mm_shuffle_epi8(in1, s1_mask);
    s2 = _mm_shuffle_epi8(in2, s2_mask);
    s = _mm_or_si128(s0, s1);
    s = _mm_or_si128(s, s2);
    _mm_store_si128(sysPtr, s); sysPtr++;

    /* Deinterleave parity 0 bits */
    p00 = _mm_shuffle_epi8(in0, p00_mask);
    p01 = _mm_shuffle_epi8(in1, p01_mask);
    p02 = _mm_shuffle_epi8(in2, p02_mask);
    p0 = _mm_or_si128(p00, p01);
    p0 = _mm_or_si128(p0, p02);
    _mm_store_si128(pa0Ptr, p0); pa0Ptr++;

    /* Deinterleave parity 1 bits */
    p10 = _mm_shuffle_epi8(in0, p10_mask);
    p11 = _mm_shuffle_epi8(in1, p11_mask);
    p12 = _mm_shuffle_epi8(in2, p12_mask);
    p1 = _mm_or_si128(p10, p11);
    p1 = _mm_or_si128(p1, p12);
    _mm_store_si128(pa1Ptr, p1); pa1Ptr++;
  }

  // Tail bits
  for (i = 0; i < 3; i++) {
    h->syst[i+long_cb]    = input[3*long_cb + 2*i];
    h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
  }
  for (i = 0; i < 3; i++) {
    h->app2[i+long_cb]    = input[3*long_cb + 6 + 2*i];
    h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
  }
}
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx; */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
	__m128i t0, t1, t3, t4, t6, t7;

	// get bounding box
	// ----------------

	// load the first row
	t0 = _mm_load_si128 ( (__m128i*) colorBlock );
	t1 = _mm_load_si128 ( (__m128i*) colorBlock );

	__m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
	// Minimum of Packed Unsigned Byte Integers
	t0 = _mm_min_epu8 ( t0, t16);
	// Maximum of Packed Unsigned Byte Integers
	t1 = _mm_max_epu8 ( t1, t16);

	__m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
	t0 = _mm_min_epu8 ( t0, t32);
	t1 = _mm_max_epu8 ( t1, t32);

	__m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
	t0 = _mm_min_epu8 ( t0, t48);
	t1 = _mm_max_epu8 ( t1, t48);

	// Shuffle Packed Doublewords
	t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
	t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );
	t0 = _mm_min_epu8 ( t0, t3);
	t1 = _mm_max_epu8 ( t1, t4);

	// Shuffle Packed Low Words
	t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
	t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );
	t0 = _mm_min_epu8 ( t0, t6);
	t1 = _mm_max_epu8 ( t1, t7);

	// inset the bounding box
	// ----------------------

	// Unpack Low Data
	//__m128i t66 = _mm_set1_epi8( 0 );
	__m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
	t0 = _mm_unpacklo_epi8(t0, t66);
	t1 = _mm_unpacklo_epi8(t1, t66);

	// copy (movdqa)
	//__m128i t2 = _mm_load_si128 ( &t1 );
	__m128i t2 = t1;

	// Subtract Packed Integers
	t2 = _mm_sub_epi16(t2, t0);

	// Shift Packed Data Right Logical
	t2 = _mm_srli_epi16(t2, INSET_SHIFT);

	// Add Packed Integers
	t0 = _mm_add_epi16(t0, t2);

	t1 = _mm_sub_epi16(t1, t2);

	// Pack with Unsigned Saturation
	t0 = _mm_packus_epi16(t0, t0);
	t1 = _mm_packus_epi16(t1, t1);

	// store bounding box extents
	// --------------------------
	_mm_store_si128 ( (__m128i*) minColor, t0 );
	_mm_store_si128 ( (__m128i*) maxColor, t1 );
}
int haraka512256(unsigned char *hash, const unsigned char *msg)
{
	// stuff we need
	int i, j;
	__m128i s[4], tmp, rcon;
	__m128i MSB64 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);

	// set initial round constant
	rcon = _mm_set_epi32(1, 1, 1, 1);

	// initialize state to msg
	s[0] = _mm_load_si128(&((__m128i*)msg)[0]);
	s[1] = _mm_load_si128(&((__m128i*)msg)[1]);
	s[2] = _mm_load_si128(&((__m128i*)msg)[2]);
	s[3] = _mm_load_si128(&((__m128i*)msg)[3]);

	//printf("= input state =\n");
	//printstate512(s[0], s[1], s[2], s[3]);

	for (i = 0; i < ROUNDS; ++i) {
		// aes round(s)
		for (j = 0; j < AES_PER_ROUND; ++j) {
			s[0] = _mm_aesenc_si128(s[0], rcon);
			s[1] = _mm_aesenc_si128(s[1], rcon);
			s[2] = _mm_aesenc_si128(s[2], rcon);
			s[3] = _mm_aesenc_si128(s[3], rcon);
			rcon = _mm_slli_epi32(rcon, 1);
		}

		//printf("= round %d : after aes layer =\n", i);
		//printstate512(s[0], s[1], s[2], s[3]);

		// mixing
		tmp  = _mm_unpacklo_epi32(s[0], s[1]);
		s[0] = _mm_unpackhi_epi32(s[0], s[1]);
		s[1] = _mm_unpacklo_epi32(s[2], s[3]);
		s[2] = _mm_unpackhi_epi32(s[2], s[3]);
		s[3] = _mm_unpacklo_epi32(s[0], s[2]);
		s[0] = _mm_unpackhi_epi32(s[0], s[2]);
		s[2] = _mm_unpackhi_epi32(s[1], tmp);
		s[1] = _mm_unpacklo_epi32(s[1], tmp);

		//printf("= round %d : after mix layer =\n", i);
		//printstate512(s[0], s[1], s[2], s[3]);

		// little-endian mixing (not used)
		// tmp  = _mm_unpackhi_epi32(s[1], s[0]);
		// s[0] = _mm_unpacklo_epi32(s[1], s[0]);
		// s[1] = _mm_unpackhi_epi32(s[3], s[2]);
		// s[2] = _mm_unpacklo_epi32(s[3], s[2]);
		// s[3] = _mm_unpackhi_epi32(s[2], s[0]);
		// s[0] = _mm_unpacklo_epi32(s[2], s[0]);
		// s[2] = _mm_unpacklo_epi32(tmp, s[1]);
		// s[1] = _mm_unpackhi_epi32(tmp, s[1]);
	}

	//printf("= output from permutation =\n");
	//printstate512(s[0], s[1], s[2], s[3]);

	// xor message to get DM effect
	s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0]));
	s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1]));
	s[2] = _mm_xor_si128(s[2], _mm_load_si128(&((__m128i*)msg)[2]));
	s[3] = _mm_xor_si128(s[3], _mm_load_si128(&((__m128i*)msg)[3]));

	//printf("= after feed-forward =\n");
	//printstate512(s[0], s[1], s[2], s[3]);

	// truncate and store result: MSB64 flags only the upper 8 bytes, so
	// each masked store writes one high qword into the output buffer
	_mm_maskmoveu_si128(s[0], MSB64, (char *)(hash - 8));
	_mm_maskmoveu_si128(s[1], MSB64, (char *)(hash + 0));
	_mm_storel_epi64((__m128i*)(hash + 16), s[2]);
	_mm_storel_epi64((__m128i*)(hash + 24), s[3]);

	return 0;
}
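/*
 * Editor's note (hedged, not from the original source): the two
 * _mm_maskmoveu_si128 calls above are a truncation trick. MSB64 flags only
 * the upper 8 bytes, so storing s[0] at (hash - 8) writes its high qword to
 * hash[0..7], and storing s[1] at hash writes its high qword to hash[8..15].
 * An equivalent and simpler formulation, assuming the same layout:
 */
#include <emmintrin.h>

static void store_high_qword(unsigned char *dst, __m128i v)
{
    /* bring the high 8 bytes down, then store the low qword */
    _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(v, 8));
}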
size_t
__strspn_sse42 (const char *s, const char *a)
{
  if (*a == 0)
    return 0;

  const char *aligned;
  __m128i mask;
  int offset = (int) ((size_t) a & 15);
  if (offset != 0)
    {
      /* Load masks.  */
      aligned = (const char *) ((size_t) a & -16L);
      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);

      mask = __m128i_shift_right (mask0, offset);

      /* Find where the NULL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16 - offset)
        {
          /* There is no NULL terminator.  */
          __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
          int index = _mm_cmpistri (mask1, mask1, 0x3a);
          length += index;

          /* Don't use SSE4.2 if the length of A > 16.  */
          if (length > 16)
            return __strspn_sse2 (s, a);

          if (index != 0)
            {
              /* Combine mask0 and mask1.  We could play games with
                 palignr, but frankly this data should be in L1 now
                 so do the merge via an unaligned load.  */
              mask = _mm_loadu_si128 ((__m128i *) a);
            }
        }
    }
  else
    {
      /* A is aligned.  */
      mask = _mm_load_si128 ((__m128i *) a);

      /* Find where the NULL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16)
        {
          /* There is no NULL terminator.  Don't use SSE4.2 if the length
             of A > 16.  */
          if (a[16] != 0)
            return __strspn_sse2 (s, a);
        }
    }

  offset = (int) ((size_t) s & 15);
  if (offset != 0)
    {
      /* Check partial string.  */
      aligned = (const char *) ((size_t) s & -16L);
      __m128i value = _mm_load_si128 ((__m128i *) aligned);

      value = __m128i_shift_right (value, offset);

      int length = _mm_cmpistri (mask, value, 0x12);
      /* No need to check CFlag since it is always 1.  */
      if (length < 16 - offset)
        return length;
      /* Find where the NULL terminator is.  */
      int index = _mm_cmpistri (value, value, 0x3a);
      if (index < 16 - offset)
        return length;
      aligned += 16;
    }
  else
    aligned = s;

  while (1)
    {
      __m128i value = _mm_load_si128 ((__m128i *) aligned);
      int index = _mm_cmpistri (mask, value, 0x12);
      int cflag = _mm_cmpistrc (mask, value, 0x12);
      if (cflag)
        return (size_t) (aligned + index - s);
      aligned += 16;
    }
}
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
	ALIGN16( byte color0[16] );
	ALIGN16( byte color1[16] );
	ALIGN16( byte color2[16] );
	ALIGN16( byte color3[16] );
	ALIGN16( byte result[16] );

	// mov esi, maxColor
	// mov edi, minColor

	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	t7 = _mm_setzero_si128();
	//t7 = _mm_xor_si128(t7, t7);
	_mm_store_si128 ( (__m128i*) &result, t7 );

	//t0 = _mm_load_si128 ( (__m128i*) maxColor );
	t0 = _mm_cvtsi32_si128( *(int*)maxColor);

	// Bitwise AND
	__m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
	t0 = _mm_and_si128(t0, tt);

	t0 = _mm_unpacklo_epi8(t0, t7);

	t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	// Bitwise Logical OR
	t0 = _mm_or_si128(t0, t4);
	t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565

	//t1 = _mm_load_si128 ( (__m128i*) minColor );
	t1 = _mm_cvtsi32_si128( *(int*)minColor);

	t1 = _mm_and_si128(t1, tt);

	t1 = _mm_unpacklo_epi8(t1, t7);

	t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	t1 = _mm_or_si128(t1, t4);
	t1 = _mm_or_si128(t1, t5);   // t1 contains color1 in 565

	t2 = t0;
	t2 = _mm_packus_epi16(t2, t7);
	t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color0, t2 );

	t6 = t0;
	t6 = _mm_add_epi16(t6, t0);
	t6 = _mm_add_epi16(t6, t1);

	// Multiply Packed Signed Integers and Store High Result
	__m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
	t6 = _mm_mulhi_epi16(t6, tw3);
	t6 = _mm_packus_epi16(t6, t7);
	t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color2, t6 );

	t3 = t1;
	t3 = _mm_packus_epi16(t3, t7);
	t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color1, t3 );

	t1 = _mm_add_epi16(t1, t1);
	t0 = _mm_add_epi16(t0, t1);

	t0 = _mm_mulhi_epi16(t0, tw3);
	t0 = _mm_packus_epi16(t0, t7);
	t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color3, t0 );

	__m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
	__m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
	__m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

	// mov eax, 32
	// mov esi, colorBlock
	int x = 32;
	//const byte *c = colorBlock;
	while (x >= 0) {
		t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
		t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
		t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t0 = t3;
		t6 = t5;
		// Compute Sum of Absolute Difference
		__m128i c0 = _mm_load_si128 ( (__m128i*) color0 );
		t0 = _mm_sad_epu8(t0, c0);
		t6 = _mm_sad_epu8(t6, c0);
		// Pack with Signed Saturation
		t0 = _mm_packs_epi32 (t0, t6);

		t1 = t3;
		t6 = t5;
		__m128i c1 = _mm_load_si128 ( (__m128i*) color1 );
		t1 = _mm_sad_epu8(t1, c1);
		t6 = _mm_sad_epu8(t6, c1);
		t1 = _mm_packs_epi32 (t1, t6);

		t2 = t3;
		t6 = t5;
		__m128i c2 = _mm_load_si128 ( (__m128i*) color2 );
		t2 = _mm_sad_epu8(t2, c2);
		t6 = _mm_sad_epu8(t6, c2);
		t2 = _mm_packs_epi32 (t2, t6);

		__m128i c3 = _mm_load_si128 ( (__m128i*) color3 );
		t3 = _mm_sad_epu8(t3, c3);
		t5 = _mm_sad_epu8(t5, c3);
		t3 = _mm_packs_epi32 (t3, t5);

		t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
		t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
		t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

		t6 = t4;
		t7 = t5;
		t6 = _mm_sad_epu8(t6, c0);
		t7 = _mm_sad_epu8(t7, c0);
		t6 = _mm_packs_epi32 (t6, t7);
		t0 = _mm_packs_epi32 (t0, t6);  // d0

		t6 = t4;
		t7 = t5;
		t6 = _mm_sad_epu8(t6, c1);
		t7 = _mm_sad_epu8(t7, c1);
		t6 = _mm_packs_epi32 (t6, t7);
		t1 = _mm_packs_epi32 (t1, t6);  // d1

		t6 = t4;
		t7 = t5;
		t6 = _mm_sad_epu8(t6, c2);
		t7 = _mm_sad_epu8(t7, c2);
		t6 = _mm_packs_epi32 (t6, t7);
		t2 = _mm_packs_epi32 (t2, t6);  // d2

		t4 = _mm_sad_epu8(t4, c3);
		t5 = _mm_sad_epu8(t5, c3);
		t4 = _mm_packs_epi32 (t4, t5);
		t3 = _mm_packs_epi32 (t3, t4);  // d3

		t7 = _mm_load_si128 ( (__m128i*) result );
		t7 = _mm_slli_epi32( t7, 16);

		t4 = t0;
		t5 = t1;
		// Compare Packed Signed Integers for Greater Than
		t0 = _mm_cmpgt_epi16(t0, t3);   // b0
		t1 = _mm_cmpgt_epi16(t1, t2);   // b1
		t4 = _mm_cmpgt_epi16(t4, t2);   // b2
		t5 = _mm_cmpgt_epi16(t5, t3);   // b3
		t2 = _mm_cmpgt_epi16(t2, t3);   // b4

		t4 = _mm_and_si128(t4, t1);     // x0
		t5 = _mm_and_si128(t5, t0);     // x1
		t2 = _mm_and_si128(t2, t0);     // x2

		t4 = _mm_or_si128(t4, t5);
		t2 = _mm_and_si128(t2, w1);
		t4 = _mm_and_si128(t4, w2);
		t2 = _mm_or_si128(t2, t4);

		t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

		// Unpack Low Data
		t2 = _mm_unpacklo_epi16 ( t2, w0);
		t5 = _mm_unpacklo_epi16 ( t5, w0);

		//t5 = _mm_slli_si128 ( t5, 8);
		t5 = _mm_slli_epi32( t5, 8);

		t7 = _mm_or_si128(t7, t5);
		t7 = _mm_or_si128(t7, t2);

		_mm_store_si128 ( (__m128i*) &result, t7 );

		x -= 32;
	}

	t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
	t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
	t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

	t4 = _mm_slli_epi32 ( t4, 2);
	t5 = _mm_slli_epi32 ( t5, 4);
	t6 = _mm_slli_epi32 ( t6, 6);

	t7 = _mm_or_si128(t7, t4);
	t7 = _mm_or_si128(t7, t5);
	t7 = _mm_or_si128(t7, t6);

	//_mm_store_si128 ( (__m128i*) outData, t7 );
	int r = _mm_cvtsi128_si32 (t7);
	memcpy(outData, &r, 4);   // Anything better ?

	outData += 4;
}
mlib_status
__mlib_VectorSumAbsDiff_S32_Sat(
    mlib_d64 *z,
    const mlib_s32 *x,
    const mlib_s32 *y,
    mlib_s32 n)
{
	if (n <= 0)
		return (MLIB_FAILURE);

	mlib_s32 i, nstep, ax, ay, n1, n2, n3;
	mlib_s32 *px = (mlib_s32 *)x, *py = (mlib_s32 *)y;
	__m128i zero, xbuf, ybuf, zbuf, xlo, xhi, mext;
	mlib_d64 dsum = 0.0;

	zero = _mm_setzero_si128();
	zbuf = zero;

	nstep = 16 / sizeof (mlib_s32);
	ax = (mlib_addr)x & 15;
	ay = (mlib_addr)y & 15;
	n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
		}
		*z = dsum;
	} else {
		for (i = 0; i < n1; i++) {
			dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
		}
		if (ax == ay) {
			for (i = 0; i < n2; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				ybuf = _mm_load_si128((__m128i *)py);
				/* |x - y| via compare, xor and subtract */
				mext = _mm_cmpgt_epi32(ybuf, xbuf);
				xbuf = _mm_sub_epi32(xbuf, ybuf);
				xbuf = _mm_xor_si128(xbuf, mext);
				xbuf = _mm_sub_epi32(xbuf, mext);
				/* zero-extend to 64 bits and accumulate */
				xlo = _mm_unpacklo_epi32(xbuf, zero);
				xhi = _mm_unpackhi_epi32(xbuf, zero);
				zbuf = _mm_add_epi64(zbuf, xlo);
				zbuf = _mm_add_epi64(zbuf, xhi);
				px += nstep;
				py += nstep;
			}
		} else {
			for (i = 0; i < n2; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				ybuf = _mm_loadu_si128((__m128i *)py);
				mext = _mm_cmpgt_epi32(ybuf, xbuf);
				xbuf = _mm_sub_epi32(xbuf, ybuf);
				xbuf = _mm_xor_si128(xbuf, mext);
				xbuf = _mm_sub_epi32(xbuf, mext);
				xlo = _mm_unpacklo_epi32(xbuf, zero);
				xhi = _mm_unpackhi_epi32(xbuf, zero);
				zbuf = _mm_add_epi64(zbuf, xlo);
				zbuf = _mm_add_epi64(zbuf, xhi);
				px += nstep;
				py += nstep;
			}
		}
		for (i = 0; i < n3; i++) {
			dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
		}

		long long pz[2];
		_mm_storeu_si128((__m128i *)pz, zbuf);
		dsum += pz[0];
		dsum += pz[1];
		*z = dsum;
	}
	return (MLIB_SUCCESS);
}
mlib_status
__mlib_VectorSumAbsDiff_S16_Sat(
    mlib_d64 *z,
    const mlib_s16 *x,
    const mlib_s16 *y,
    mlib_s32 n)
{
	if (n <= 0)
		return (MLIB_FAILURE);

	mlib_s32 i, nstep, ax, ay, n1, n2, n3, xval, sum = 0;
	mlib_s16 *px = (mlib_s16 *)x, *py = (mlib_s16 *)y;
	__m128i zero, xbuf, ybuf, zbuf32, zbuf64, xlo, xhi, mext;

	zero = _mm_setzero_si128();
	zbuf64 = zero;

	nstep = 16 / sizeof (mlib_s16);
	ax = (mlib_addr)x & 15;
	ay = (mlib_addr)y & 15;
	n1 = ((16 - ax) & 15) / sizeof (mlib_s16);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			xval = (mlib_s32)(*px++) - (*py++);
			sum += ABS_VALUE(xval);
		}
		*z = sum;
	} else {
		for (i = 0; i < n1; i++) {
			xval = (mlib_s32)(*px++) - (*py++);
			sum += ABS_VALUE(xval);
		}

		/*
		 * Accumulate in blocks of 4096 vectors so the 32-bit lanes
		 * cannot overflow (4096 * 2 * 65535 < 2^31) before being
		 * widened into the 64-bit accumulator.
		 */
		mlib_s32 nblock = n2 >> 12;
		mlib_s32 tail = n2 & 4095;
		mlib_s32 k;

		if (ax == ay) {
			for (k = 0; k < nblock; k++) {
				zbuf32 = zero;
				for (i = 0; i < 4096; i++) {
					xbuf = _mm_load_si128((__m128i *)px);
					ybuf = _mm_load_si128((__m128i *)py);
					mext = _mm_cmpgt_epi16(ybuf, xbuf);
					xbuf = _mm_sub_epi16(xbuf, ybuf);
					xbuf = _mm_xor_si128(xbuf, mext);
					xbuf = _mm_sub_epi16(xbuf, mext);
					xlo = _mm_unpacklo_epi16(xbuf, zero);
					xhi = _mm_unpackhi_epi16(xbuf, zero);
					zbuf32 = _mm_add_epi32(zbuf32, xlo);
					zbuf32 = _mm_add_epi32(zbuf32, xhi);
					px += nstep;
					py += nstep;
				}
				xlo = _mm_unpacklo_epi32(zbuf32, zero);
				xhi = _mm_unpackhi_epi32(zbuf32, zero);
				zbuf64 = _mm_add_epi64(zbuf64, xlo);
				zbuf64 = _mm_add_epi64(zbuf64, xhi);
			}
			zbuf32 = zero;
			for (i = 0; i < tail; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				ybuf = _mm_load_si128((__m128i *)py);
				mext = _mm_cmpgt_epi16(ybuf, xbuf);
				xbuf = _mm_sub_epi16(xbuf, ybuf);
				xbuf = _mm_xor_si128(xbuf, mext);
				xbuf = _mm_sub_epi16(xbuf, mext);
				xlo = _mm_unpacklo_epi16(xbuf, zero);
				xhi = _mm_unpackhi_epi16(xbuf, zero);
				zbuf32 = _mm_add_epi32(zbuf32, xlo);
				zbuf32 = _mm_add_epi32(zbuf32, xhi);
				px += nstep;
				py += nstep;
			}
			xlo = _mm_unpacklo_epi32(zbuf32, zero);
			xhi = _mm_unpackhi_epi32(zbuf32, zero);
			zbuf64 = _mm_add_epi64(zbuf64, xlo);
			zbuf64 = _mm_add_epi64(zbuf64, xhi);
		} else {
			/* not aligned */
			for (k = 0; k < nblock; k++) {
				zbuf32 = zero;
				for (i = 0; i < 4096; i++) {
					xbuf = _mm_load_si128((__m128i *)px);
					ybuf = _mm_loadu_si128((__m128i *)py);
					mext = _mm_cmpgt_epi16(ybuf, xbuf);
					xbuf = _mm_sub_epi16(xbuf, ybuf);
					xbuf = _mm_xor_si128(xbuf, mext);
					xbuf = _mm_sub_epi16(xbuf, mext);
					xlo = _mm_unpacklo_epi16(xbuf, zero);
					xhi = _mm_unpackhi_epi16(xbuf, zero);
					zbuf32 = _mm_add_epi32(zbuf32, xlo);
					zbuf32 = _mm_add_epi32(zbuf32, xhi);
					px += nstep;
					py += nstep;
				}
				xlo = _mm_unpacklo_epi32(zbuf32, zero);
				xhi = _mm_unpackhi_epi32(zbuf32, zero);
				zbuf64 = _mm_add_epi64(zbuf64, xlo);
				zbuf64 = _mm_add_epi64(zbuf64, xhi);
			}
			zbuf32 = zero;
			for (i = 0; i < tail; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				ybuf = _mm_loadu_si128((__m128i *)py);
				mext = _mm_cmpgt_epi16(ybuf, xbuf);
				xbuf = _mm_sub_epi16(xbuf, ybuf);
				xbuf = _mm_xor_si128(xbuf, mext);
				xbuf = _mm_sub_epi16(xbuf, mext);
				xlo = _mm_unpacklo_epi16(xbuf, zero);
				xhi = _mm_unpackhi_epi16(xbuf, zero);
				zbuf32 = _mm_add_epi32(zbuf32, xlo);
				zbuf32 = _mm_add_epi32(zbuf32, xhi);
				px += nstep;
				py += nstep;
			}
			xlo = _mm_unpacklo_epi32(zbuf32, zero);
			xhi = _mm_unpackhi_epi32(zbuf32, zero);
			zbuf64 = _mm_add_epi64(zbuf64, xlo);
			zbuf64 = _mm_add_epi64(zbuf64, xhi);
		}

		for (i = 0; i < n3; i++) {
			xval = (mlib_s32)(*px++) - (*py++);
			sum += ABS_VALUE(xval);
		}

		mlib_d64 dsum = sum;
		long long pz[2];
		_mm_storeu_si128((__m128i *)pz, zbuf64);
		dsum += pz[0];
		dsum += pz[1];
		*z = dsum;
	}
	return (MLIB_SUCCESS);
}
mlib_status
__mlib_VectorSumAbsDiff_S8_Sat(
    mlib_d64 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
	if (n <= 0)
		return (MLIB_FAILURE);

	mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0;
	mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y;
	__m128i zero, xbuf, ybuf, zbuf, mext, mbuf;

	zero = _mm_setzero_si128();
	zbuf = zero;

	nstep = 16 / sizeof (mlib_s8);
	ax = (mlib_addr)x & 15;
	ay = (mlib_addr)y & 15;
	n1 = ((16 - ax) & 15) / sizeof (mlib_s8);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			diff = (mlib_s32)(*px++) - (*py++);
			sum += ABS_VALUE(diff);
		}
		*z = sum;
	} else {
		for (i = 0; i < n1; i++) {
			diff = (mlib_s32)(*px++) - (*py++);
			sum += ABS_VALUE(diff);
		}
		if (ax == ay) {
			for (i = 0; i < n2; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				ybuf = _mm_load_si128((__m128i *)py);
				mext = _mm_cmpgt_epi8(ybuf, xbuf);
				mbuf = _mm_sub_epi8(xbuf, ybuf);
				mbuf = _mm_xor_si128(mbuf, mext);
				mbuf = _mm_sub_epi8(mbuf, mext);
				/* SAD against zero sums 8 byte values per qword */
				mbuf = _mm_sad_epu8(mbuf, zero);
				zbuf = _mm_add_epi64(zbuf, mbuf);
				px += nstep;
				py += nstep;
			}
		} else {
			for (i = 0; i < n2; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				ybuf = _mm_loadu_si128((__m128i *)py);
				mext = _mm_cmpgt_epi8(ybuf, xbuf);
				mbuf = _mm_sub_epi8(xbuf, ybuf);
				mbuf = _mm_xor_si128(mbuf, mext);
				mbuf = _mm_sub_epi8(mbuf, mext);
				mbuf = _mm_sad_epu8(mbuf, zero);
				zbuf = _mm_add_epi64(zbuf, mbuf);
				px += nstep;
				py += nstep;
			}
		}
		for (i = 0; i < n3; i++) {
			diff = (mlib_s32)(*px++) - (*py++);
			sum += ABS_VALUE(diff);
		}

		mlib_d64 dsum = sum;
		long long pz[2];
		_mm_storeu_si128((__m128i *)pz, zbuf);
		dsum += pz[0];
		dsum += pz[1];
		*z = dsum;
	}
	return (MLIB_SUCCESS);
}
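/*
 * Editor's aside (illustration only, not from the original source): all three
 * kernels above compute |x - y| with the same three-instruction idiom: a
 * compare produces an all-ones mask where y > x, and xor-then-subtract of
 * that mask conditionally negates the difference (two's complement).
 * Scalar model of the 16-bit lane version:
 */
#include <stdint.h>

static inline uint16_t absdiff16(int16_t x, int16_t y)
{
    int16_t d    = (int16_t)(x - y);             /* may wrap, like _mm_sub_epi16 */
    int16_t mask = (int16_t)((y > x) ? -1 : 0);  /* _mm_cmpgt_epi16 */
    return (uint16_t)((d ^ mask) - mask);        /* negate when mask is all ones */
}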
void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 2; ++i) {
    /* No-op for i == 0; advances to the right-hand 8 columns for i == 1. */
    input += 8 * i;

    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  /* Widen the 16-bit partial sums to 32 bits and reduce. */
  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);
  sum = _mm_add_epi32(in0, in1);

  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);
  sum = _mm_add_epi32(in0, in1);

  in0 = _mm_srli_si128(sum, 8);
  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 1);
  store_output(&in1, output);
}
static int blake64_compress( state * state, const u8 * datablock )
{
  __m128i row1a, row1b;
  __m128i row2a, row2b;
  __m128i row3a, row3b;
  __m128i row4a, row4b;
  __m128i buf1a, buf2a;

  static const u8 rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9};
  __m128i r16 = _mm_load_si128((__m128i*)rot16);

  u64 m[16];
  u64 y[16];

  /* constants and permutation */
  static const int sig[][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
  };

  static const u64 z[16] = {
    0x243F6A8885A308D3ULL, 0x13198A2E03707344ULL,
    0xA4093822299F31D0ULL, 0x082EFA98EC4E6C89ULL,
    0x452821E638D01377ULL, 0xBE5466CF34E90C6CULL,
    0xC0AC29B7C97C50DDULL, 0x3F84D5B5B5470917ULL,
    0x9216D5D98979FB1BULL, 0xD1310BA698DFB5ACULL,
    0x2FFD72DBD01ADFB7ULL, 0xB8E1AFED6A267E96ULL,
    0xBA7C9045F12C7F99ULL, 0x24A19947B3916CF7ULL,
    0x0801F2E2858EFC16ULL, 0x636920D871574E69ULL
  };

  /* get message */
  m[ 0] = U8TO64(datablock +   0);
  m[ 1] = U8TO64(datablock +   8);
  m[ 2] = U8TO64(datablock +  16);
  m[ 3] = U8TO64(datablock +  24);
  m[ 4] = U8TO64(datablock +  32);
  m[ 5] = U8TO64(datablock +  40);
  m[ 6] = U8TO64(datablock +  48);
  m[ 7] = U8TO64(datablock +  56);
  m[ 8] = U8TO64(datablock +  64);
  m[ 9] = U8TO64(datablock +  72);
  m[10] = U8TO64(datablock +  80);
  m[11] = U8TO64(datablock +  88);
  m[12] = U8TO64(datablock +  96);
  m[13] = U8TO64(datablock + 104);
  m[14] = U8TO64(datablock + 112);
  m[15] = U8TO64(datablock + 120);

  row1b = _mm_set_epi64((__m64)state->h[3], (__m64)state->h[2]);
  row1a = _mm_set_epi64((__m64)state->h[1], (__m64)state->h[0]);
  row2b = _mm_set_epi64((__m64)state->h[7], (__m64)state->h[6]);
  row2a = _mm_set_epi64((__m64)state->h[5], (__m64)state->h[4]);
  row3b = _mm_set_epi64((__m64)0x082EFA98EC4E6C89ULL, (__m64)0xA4093822299F31D0ULL);
  row3a = _mm_set_epi64((__m64)0x13198A2E03707344ULL, (__m64)0x243F6A8885A308D3ULL);

  if (state->nullt) {
    row4b = _mm_set_epi64((__m64)0x3F84D5B5B5470917ULL, (__m64)0xC0AC29B7C97C50DDULL);
    row4a = _mm_set_epi64((__m64)0xBE5466CF34E90C6CULL, (__m64)0x452821E638D01377ULL);
  }
  else {
    row4b = _mm_set_epi64((__m64)(0x3F84D5B5B5470917ULL ^ state->t[1]),
                          (__m64)(0xC0AC29B7C97C50DDULL ^ state->t[1]));
    row4a = _mm_set_epi64((__m64)(0xBE5466CF34E90C6CULL ^ state->t[0]),
                          (__m64)(0x452821E638D01377ULL ^ state->t[0]));
  }
  /* initialization ok (beware of bug on Celeron and P4!) */

#define round(r) \
  /* column step */ \
  /***************************************************/ \
  /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13 */ \
  buf2a = _mm_set_epi64( (__m64)m[sig[r][ 2]], (__m64)m[sig[r][ 0]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][ 3]], (__m64)z[sig[r][ 1]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \
  row4a = _mm_xor_si128( row4a, row1a ); \
  row4a = _mm_shuffle_epi32(row4a, 0xB1); \
  row3a = _mm_add_epi64( row3a, row4a ); \
  row2a = _mm_xor_si128( row2a, row3a ); \
  row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ), _mm_slli_epi64( row2a, 39 )); \
\
  buf2a = _mm_set_epi64( (__m64)m[sig[r][ 3]], (__m64)m[sig[r][ 1]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][ 2]], (__m64)z[sig[r][ 0]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \
  row4a = _mm_xor_si128( row4a, row1a ); \
  row4a = _mm_shuffle_epi8(row4a, r16); \
  row3a = _mm_add_epi64( row3a, row4a ); \
  row2a = _mm_xor_si128( row2a, row3a ); \
  row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ), _mm_slli_epi64( row2a, 53 )); \
\
  /* same stuff for low-order side */ \
  buf2a = _mm_set_epi64( (__m64)m[sig[r][ 6]], (__m64)m[sig[r][ 4]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][ 7]], (__m64)z[sig[r][ 5]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \
  row4b = _mm_xor_si128( row4b, row1b ); \
  row4b = _mm_shuffle_epi32(row4b, 0xB1); \
  row3b = _mm_add_epi64( row3b, row4b ); \
  row2b = _mm_xor_si128( row2b, row3b ); \
  row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ), _mm_slli_epi64( row2b, 39 )); \
\
  buf2a = _mm_set_epi64( (__m64)m[sig[r][ 7]], (__m64)m[sig[r][ 5]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][ 6]], (__m64)z[sig[r][ 4]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \
  row4b = _mm_xor_si128( row4b, row1b ); \
  row4b = _mm_shuffle_epi8(row4b, r16); \
  row3b = _mm_add_epi64( row3b, row4b ); \
  row2b = _mm_xor_si128( row2b, row3b ); \
  row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ), _mm_slli_epi64( row2b, 53 )); \
\
  /* shuffle */ \
  _mm_store_si128( 0 + (__m128i *)y, row4a); \
  _mm_store_si128( 1 + (__m128i *)y, row4b); \
  row4a = row3a; \
  row3a = row3b; \
  row3b = row4a; \
  row4a = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \
  row4b = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
  _mm_store_si128( 0 + (__m128i *)y, row2a); \
  _mm_store_si128( 1 + (__m128i *)y, row2b); \
  row2a = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
  row2b = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \
\
  /* diagonal step */ \
  /***************************************************/ \
  /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13 */ \
  buf2a = _mm_set_epi64( (__m64)m[sig[r][10]], (__m64)m[sig[r][ 8]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][11]], (__m64)z[sig[r][ 9]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \
  row4a = _mm_xor_si128( row4a, row1a ); \
  row4a = _mm_shuffle_epi32(row4a, 0xB1); \
  row3a = _mm_add_epi64( row3a, row4a ); \
  row2a = _mm_xor_si128( row2a, row3a ); \
  row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ), _mm_slli_epi64( row2a, 39 )); \
\
  buf2a = _mm_set_epi64( (__m64)m[sig[r][11]], (__m64)m[sig[r][ 9]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][10]], (__m64)z[sig[r][ 8]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a ); \
  row4a = _mm_xor_si128( row4a, row1a ); \
  row4a = _mm_shuffle_epi8(row4a, r16); \
  row3a = _mm_add_epi64( row3a, row4a ); \
  row2a = _mm_xor_si128( row2a, row3a ); \
  row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ), _mm_slli_epi64( row2a, 53 )); \
\
  /* same stuff for low-order side */ \
  buf2a = _mm_set_epi64( (__m64)m[sig[r][14]], (__m64)m[sig[r][12]] ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][15]], (__m64)z[sig[r][13]] ); \
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \
  row4b = _mm_xor_si128( row4b, row1b ); \
  buf2a = _mm_set_epi64( (__m64)m[sig[r][15]], (__m64)m[sig[r][13]] ); \
  row4b = _mm_shuffle_epi32(row4b, 0xB1); \
  row3b = _mm_add_epi64( row3b, row4b ); \
  row2b = _mm_xor_si128( row2b, row3b ); \
  buf1a = _mm_set_epi64( (__m64)z[sig[r][14]], (__m64)z[sig[r][12]] ); \
  row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ), _mm_slli_epi64( row2b, 39 )); \
\
  buf1a = _mm_xor_si128( buf1a, buf2a ); \
  row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b ); \
  row4b = _mm_xor_si128( row4b, row1b ); \
  row4b = _mm_shuffle_epi8(row4b, r16); \
  row3b = _mm_add_epi64( row3b, row4b ); \
  row2b = _mm_xor_si128( row2b, row3b ); \
  row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ), _mm_slli_epi64( row2b, 53 )); \
\
  /* shuffle back */ \
  buf1a = row3a; \
  row3a = row3b; \
  row3b = buf1a; \
  _mm_store_si128( 0 + (__m128i *)y, row2a); \
  _mm_store_si128( 1 + (__m128i *)y, row2b); \
  row2a = _mm_set_epi64( (__m64)y[0], (__m64)y[3] ); \
  row2b = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
  _mm_store_si128( 0 + (__m128i *)y, row4a); \
  _mm_store_si128( 1 + (__m128i *)y, row4b); \
  row4a = _mm_set_epi64( (__m64)y[2], (__m64)y[1] ); \
  row4b = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );

  round(0);
  round(1);
  round(2);
  round(3);
  round(4);
  round(5);
  round(6);
  round(7);
  round(8);
  round(9);
  round(10);
  round(11);
  round(12);
  round(13);

  row1a = _mm_xor_si128(row3a, row1a);
  row1b = _mm_xor_si128(row3b, row1b);
  _mm_store_si128( (__m128i *)m, row1a);
  state->h[0] ^= m[0];
  state->h[1] ^= m[1];
  _mm_store_si128( (__m128i *)m, row1b);
  state->h[2] ^= m[0];
  state->h[3] ^= m[1];
  row2a = _mm_xor_si128(row4a, row2a);
  row2b = _mm_xor_si128(row4b, row2b);
  _mm_store_si128( (__m128i *)m, row2a);
  state->h[4] ^= m[0];
  state->h[5] ^= m[1];
  _mm_store_si128( (__m128i *)m, row2b);
  state->h[6] ^= m[0];
  state->h[7] ^= m[1];

  return 0;
}
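/*
 * Editor's aside (illustration, not from the original source): the round
 * macro above spells BLAKE-64's rotations three ways: rotate-right by 32 as
 * a dword shuffle (0xB1), by 16 as a byte shuffle (r16), and by 25/11 as a
 * shift/shift/xor pair. The generic shift-based form, for reference:
 */
#include <emmintrin.h>

static inline __m128i rotr64_sketch(__m128i x, int n)  /* 0 < n < 64 */
{
    return _mm_xor_si128(_mm_srli_epi64(x, n), _mm_slli_epi64(x, 64 - n));
}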
/* Computes alpha metrics */
void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
{
  uint32_t k;
  int16_t *alpha = s->alpha;
  uint32_t i;

  alpha[0] = 0;
  for (i = 1; i < 8; i++) {
    alpha[i] = -INF;
  }

  /* Define the shuffle constant for the positive alpha */
  __m128i shuf_ap = _mm_set_epi8(
    15, 14, // 7
     9,  8, // 4
     7,  6, // 3
     1,  0, // 0
    13, 12, // 6
    11, 10, // 5
     5,  4, // 2
     3,  2  // 1
  );

  /* Define the shuffle constant for the negative alpha */
  __m128i shuf_an = _mm_set_epi8(
    13, 12, // 6
    11, 10, // 5
     5,  4, // 2
     3,  2, // 1
    15, 14, // 7
     9,  8, // 4
     7,  6, // 3
     1,  0  // 0
  );

  /* Define shuffle for branch costs */
  __m128i shuf_g[4];
  shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
  shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
  shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
  shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);

  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);

  __m128i* alphaPtr = (__m128i*) alpha;
  alphaPtr++;

  __m128i gv;
  __m128i *gPtr = (__m128i*) s->branch;
  __m128i g, ap, an;

  __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);

  /* This defines an alpha computation step:
   * Adds and subtracts the branch metrics to the previous alpha step,
   * shuffles the states according to the trellis path and selects the
   * maximum state
   */
#define ALPHA_STEP(c) g = _mm_shuffle_epi8(gv, shuf_g[c]); \
  ap = _mm_add_epi16(alpha_k, g); \
  an = _mm_sub_epi16(alpha_k, g); \
  ap = _mm_shuffle_epi8(ap, shuf_ap); \
  an = _mm_shuffle_epi8(an, shuf_an); \
  alpha_k = _mm_max_epi16(ap, an); \
  _mm_store_si128(alphaPtr, alpha_k); \
  alphaPtr++;

  /* In this loop, we compute 8 steps and normalize twice for each branch
   * metrics memory load */
  __m128i norm;
  for (k = 0; k < long_cb/8; k++) {
    gv = _mm_load_si128(gPtr);
    gPtr++;
    ALPHA_STEP(0);
    ALPHA_STEP(1);
    ALPHA_STEP(2);
    ALPHA_STEP(3);
    norm = _mm_shuffle_epi8(alpha_k, shuf_norm);
    alpha_k = _mm_sub_epi16(alpha_k, norm);
    gv = _mm_load_si128(gPtr);
    gPtr++;
    ALPHA_STEP(0);
    ALPHA_STEP(1);
    ALPHA_STEP(2);
    ALPHA_STEP(3);
    norm = _mm_shuffle_epi8(alpha_k, shuf_norm);
    alpha_k = _mm_sub_epi16(alpha_k, norm);
  }
}
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
	if (!pcszText || !*pcszText)
		return;

	// Find out actual size of text
	int nChars = _tcslen(pcszText);
	uFlags |= DT_NOCLIP;

	int iX = rc.left;
	int iY = rc.top;
	int iXW = (rc.right - iX);
	int iYH = (rc.bottom - iY);

	RECT rcMin = rc;
	if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
		int iMinXW = rcMin.right - rcMin.left;
		int iMinYH = rcMin.bottom - rcMin.top;
		if (iMinXW < iXW) {
			if (uFlags & DT_CENTER) {
				iX += (iXW - iMinXW)/2;
				uFlags &= ~DT_CENTER;
			}
			else if (uFlags & DT_RIGHT) {
				iX += (iXW - iMinXW);
				uFlags &= ~DT_RIGHT;
			}
			iXW = iMinXW;
		}
		if (iMinYH < iYH) {
			if (uFlags & DT_SINGLELINE) {
				if (uFlags & DT_VCENTER) {
					iY += (iYH - iMinYH)/2;
					uFlags &= ~DT_VCENTER;
				}
				else if (uFlags & DT_BOTTOM) {
					iY += (iYH - iMinYH);
					uFlags &= ~DT_BOTTOM;
				}
			}
			iYH = iMinYH;
		}
	}

	iXW += 2;	// NB: +2 because we want an extra pixel at the border so that the font smoothing will look better!
	iYH += 2;

	// Ensure we have a big enough DIB to draw the text to
	if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH))
		CreateTextDIB(iXW, iYH);
	if (!hbmpTextDIB)
		return;

	// Select color
	ieBGRA clr;
	switch (eColor) {
	case eFileName: clr = clrFileName; break;
	case eComment:  clr = clrComment;  break;
	case eFileInfo: clr = clrFileInfo; break;
	default:        clr = ieBGRA(0,0,0); break;
	}
	clr.A = 0xFF - clrBkg.A;

	// Draw the text to in-memory DIB
	RECT rcTextDIB = { 0, 0, iXW, iYH };
	FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);
	rcTextDIB.left++;
	rcTextDIB.top++;

	DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

	// Modify DIB:
#ifndef __X64__
	if (g_bSSE2)
#endif
	{
		__m128i r0, r1, r2, r3, r4, r5, r6, r7;
		r7 = _mm_setzero_si128();          // 0
		r6 = _mm_set1_epi32(clr.dw);       // CA CR CG CB  CA CR CG CB  CA CR CG CB  CA CR CG CB
		r6 = _mm_unpacklo_epi8(r7, r6);    // CA<<8 CR<<8 CG<<8 CB<<8  CA<<8 CR<<8 CG<<8 CB<<8
		r5 = _mm_set1_epi16(1);            // 1 1 1 1  1 1 1 1
		r4 = _mm_set1_epi32(0xFF);         // FF FF FF FF
		r3 = _mm_set1_epi32(clrBkg.dw);    // DA 0 0 0  DA 0 0 0  DA 0 0 0  DA 0 0 0

		ieBGRA *py = pTextDIB;
		for (int y = iYH; y--; py += iTextDIBXW) {
			ieBGRA *px = py;
			for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {
				r0 = _mm_load_si128((__m128i *)px);
				r1 = r0;
				r2 = r0;                         // X3 R3 G3 B3  X2 R2 G2 B2  X1 R1 G1 B1  X0 R0 G0 B0
				r0 = _mm_srli_epi32(r0, 16);     //  0  0 X3 R3   0  0 X2 R2   0  0 X1 R1   0  0 X0 R0
				r1 = _mm_srli_epi32(r1, 8);      //  0 X3 R3 G3   0 X2 R2 G2   0 X1 R1 G1   0 X0 R0 G0
				r0 = _mm_max_epu8(r0, r2);
				r0 = _mm_max_epu8(r0, r1);       //  x  x  x A3   x  x  x A2   x  x  x A1   x  x  x A0
				r0 = _mm_and_si128(r0, r4);      //  0 A3  0 A2  0 A1  0 A0
				r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
				r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));   // A3 A3 A2 A2  A1 A1 A0 A0
				r1 = r0;
				r0 = _mm_unpacklo_epi32(r0, r0); // A1 A1 A1 A1  A0 A0 A0 A0
				r1 = _mm_unpackhi_epi32(r1, r1); // A3 A3 A3 A3  A2 A2 A2 A2
				r0 = _mm_add_epi16(r0, r5);      // A1' A1' A1' A1'  A0' A0' A0' A0'
				r1 = _mm_add_epi16(r1, r5);      // A3' A3' A3' A3'  A2' A2' A2' A2'
				r0 = _mm_mulhi_epu16(r0, r6);    // xA1" xR1 xG1 xB1  xA0" xR0 xG0 xB0
				r1 = _mm_mulhi_epu16(r1, r6);    // xA3" xR3 xG3 xB3  xA2" xR2 xG2 xB2
				r0 = _mm_packus_epi16(r0, r1);   // xA3" xR3 xG3 xB3  xA2" xR2 xG2 xB2  xA1" xR1 xG1 xB1  xA0" xR0 xG0 xB0
				r0 = _mm_adds_epu8(r0, r3);      // xA3 xR3 xG3 xB3  xA2 xR2 xG2 xB2  xA1 xR1 xG1 xB1  xA0 xR0 xG0 xB0
				_mm_store_si128((__m128i *)px, r0);
			}
		}
	}
#ifndef __X64__
	else {
/* Computes beta values */
void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
{
  int k;
  uint32_t end = long_cb + 3;
  const __m128i *alphaPtr = (const __m128i*) s->alpha;

  __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
  __m128i g, bp, bn, alpha_k;

  /* Define the shuffle constant for the positive beta */
  __m128i shuf_bp = _mm_set_epi8(
    15, 14, // 7
     7,  6, // 3
     5,  4, // 2
    13, 12, // 6
    11, 10, // 5
     3,  2, // 1
     1,  0, // 0
     9,  8  // 4
  );

  /* Define the shuffle constant for the negative beta */
  __m128i shuf_bn = _mm_set_epi8(
     7,  6, // 3
    15, 14, // 7
    13, 12, // 6
     5,  4, // 2
     3,  2, // 1
    11, 10, // 5
     9,  8, // 4
     1,  0  // 0
  );

  alphaPtr += long_cb-1;

  /* Define shuffle for branch costs */
  __m128i shuf_g[4];
  shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
  shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
  shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
  shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);

  __m128i gv;
  int16_t *b = &s->branch[2*long_cb-8];
  __m128i *gPtr = (__m128i*) b;

  /* Define shuffle for beta normalization */
  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);

  /* This defines a beta computation step:
   * Adds and subtracts the branch metrics to the previous beta step,
   * shuffles the states according to the trellis path and selects the
   * maximum state
   */
#define BETA_STEP(g) bp = _mm_add_epi16(beta_k, g); \
  bn = _mm_sub_epi16(beta_k, g); \
  bp = _mm_shuffle_epi8(bp, shuf_bp); \
  bn = _mm_shuffle_epi8(bn, shuf_bn); \
  beta_k = _mm_max_epi16(bp, bn);

  /* Loads the alpha metrics from memory and adds them to the temporal bn and
   * bp metrics. Then computes the horizontal maximum of both metrics and
   * takes the difference */
#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]); \
  BETA_STEP(g) \
  alpha_k = _mm_load_si128(alphaPtr); \
  alphaPtr--; \
  bp = _mm_add_epi16(bp, alpha_k); \
  bn = _mm_add_epi16(bn, alpha_k); \
  output[k-d] = hMax(bn) - hMax(bp);

  /* The tail does not require loading alpha or producing outputs. Only
   * update the beta metrics accordingly */
  for (k = end-1; k >= (int)long_cb; k--) {
    int16_t g0 = s->branch[2*k];
    int16_t g1 = s->branch[2*k+1];
    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
    BETA_STEP(g);
  }

  /* We inline 2 trellis steps for each normalization */
  __m128i norm;
  for (; k >= 0; k -= 8) {
    gv = _mm_load_si128(gPtr);
    gPtr--;
    BETA_STEP_CNT(0,0);
    BETA_STEP_CNT(1,1);
    BETA_STEP_CNT(2,2);
    BETA_STEP_CNT(3,3);
    norm = _mm_shuffle_epi8(beta_k, shuf_norm);
    beta_k = _mm_sub_epi16(beta_k, norm);
    gv = _mm_load_si128(gPtr);
    gPtr--;
    BETA_STEP_CNT(0,4);
    BETA_STEP_CNT(1,5);
    BETA_STEP_CNT(2,6);
    BETA_STEP_CNT(3,7);
    norm = _mm_shuffle_epi8(beta_k, shuf_norm);
    beta_k = _mm_sub_epi16(beta_k, norm);
  }
}
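/*
 * Editor's sketch (assumption -- hMax() is defined elsewhere in the original
 * source): a typical horizontal maximum over eight signed 16-bit lanes,
 * implemented with three shuffle/max pairs, which is what BETA_STEP_CNT
 * relies on:
 */
#include <emmintrin.h>

static inline int16_t hMax_sketch(__m128i v)
{
    v = _mm_max_epi16(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));   /* fold qwords */
    v = _mm_max_epi16(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));   /* fold dwords */
    v = _mm_max_epi16(v, _mm_shufflelo_epi16(v, _MM_SHUFFLE(2, 3, 0, 1))); /* fold words */
    return (int16_t)_mm_extract_epi16(v, 0);
}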
// @return true iff the two pages differ; false otherwise.
// @note Uses SSE2 intrinsics, so you must compile with -msse2 (or higher).
bool pagesDifferent (const void * b1, const void * b2) {

  enum { PAGE_SIZE = 4096 };

  // Make a mask, initially all 1's.
  register __m128i mask = _mm_setzero_si128();
  mask = _mm_cmpeq_epi32(mask, mask);

  __m128i * buf1 = (__m128i *) b1;
  __m128i * buf2 = (__m128i *) b2;

  // Some vectorizing pragmas here; not sure if gcc implements them.

#pragma vector always
  for (unsigned i = 0; i < PAGE_SIZE / sizeof(__m128i); i += 8) {

#pragma ivdep
#pragma vector aligned

    register __m128i xmm1, xmm2;

    // Unrolled loop for speed: at each step we load a 128-bit chunk from
    // each page and logically AND in their comparison.
    // If the mask gets any zero bits, the bytes differ.

    xmm1 = _mm_load_si128 (&buf1[i]);
    xmm2 = _mm_load_si128 (&buf2[i]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+1]);
    xmm2 = _mm_load_si128 (&buf2[i+1]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+2]);
    xmm2 = _mm_load_si128 (&buf2[i+2]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+3]);
    xmm2 = _mm_load_si128 (&buf2[i+3]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+4]);
    xmm2 = _mm_load_si128 (&buf2[i+4]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+5]);
    xmm2 = _mm_load_si128 (&buf2[i+5]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+6]);
    xmm2 = _mm_load_si128 (&buf2[i+6]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    xmm1 = _mm_load_si128 (&buf1[i+7]);
    xmm2 = _mm_load_si128 (&buf2[i+7]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    // Save the mask to see whether we have found a difference or not.
    unsigned long long buf[128 / sizeof(unsigned long long) / 8] __attribute__((aligned(16)));
    _mm_store_si128 ((__m128i *) &buf, mask);

    // IMPORTANT: make sure long long = 64 bits!
    enum { VERIFY_LONGLONG_64 = 1 / (sizeof(long long) == 8) };

    // Now check the result.
    // Both buf[0] and buf[1] should be all ones.
    if ((buf[0] != (unsigned long long) -1) ||
        (buf[1] != (unsigned long long) -1)) {
      return true;
    }
  }

  // No differences found.
  return false;
}
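/*
 * Editor's aside (alternative sketch, not from the original source): instead
 * of storing the comparison mask and checking two 64-bit halves, SSE2's
 * _mm_movemask_epi8 collapses the mask into a 16-bit integer in a single
 * instruction, which allows the same early-exit test without a store:
 */
#include <emmintrin.h>

static inline int mask_is_all_ones(__m128i mask)
{
    /* one bit per byte of the mask; 0xffff means every byte compared equal */
    return _mm_movemask_epi8(mask) == 0xffff;
}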
void lp_rast_triangle_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; unsigned y = (arg.triangle.plane_mask >> 8) + task->y; __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = _mm_setzero_si128(); __m128i c; __m128i dcdx; __m128i dcdy; __m128i dcdx2; __m128i dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; transpose4_epi32(&p0, &p1, &p2, &zero, &c, &dcdx, &dcdy, &unused); /* Adjust dcdx; */ dcdx = _mm_sub_epi32(zero, dcdx); c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ c = _mm_sub_epi32(c, _mm_set1_epi32(1)); dcdx2 = _mm_add_epi32(dcdx, dcdx); dcdx3 = _mm_add_epi32(dcdx2, dcdx); transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, &span_0, &span_1, &span_2, &unused); { __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0); __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); __m128i c_01 = _mm_packs_epi32(c_0, c_1); __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); __m128i c_23 = _mm_packs_epi32(c_2, c_3); __m128i c_0123 = _mm_packs_epi16(c_01, c_23); unsigned mask = _mm_movemask_epi8(c_0123); if (mask != 0xffff) lp_rast_shade_quads_mask(task, &tri->inputs, x, y, 0xffff & ~mask); } }
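/* transpose4_epi32() and SCALAR_EPI32() are helpers defined elsewhere in the
 * rasterizer. A sketch of what they are assumed to do: SCALAR_EPI32 broadcasts
 * one 32-bit lane, and transpose4_epi32 is a plain 4x4 dword transpose built
 * from unpacks: */
#include <emmintrin.h>

#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE((i), (i), (i), (i)))

static inline void transpose4_epi32(const __m128i *a, const __m128i *b,
                                    const __m128i *c, const __m128i *d,
                                    __m128i *o, __m128i *p,
                                    __m128i *q, __m128i *r)
{
    __m128i t0 = _mm_unpacklo_epi32(*a, *b); /* a0 b0 a1 b1 */
    __m128i t1 = _mm_unpacklo_epi32(*c, *d); /* c0 d0 c1 d1 */
    __m128i t2 = _mm_unpackhi_epi32(*a, *b); /* a2 b2 a3 b3 */
    __m128i t3 = _mm_unpackhi_epi32(*c, *d); /* c2 d2 c3 d3 */
    *o = _mm_unpacklo_epi64(t0, t1);         /* a0 b0 c0 d0 */
    *p = _mm_unpackhi_epi64(t0, t1);         /* a1 b1 c1 d1 */
    *q = _mm_unpacklo_epi64(t2, t3);         /* a2 b2 c2 d2 */
    *r = _mm_unpackhi_epi64(t2, t3);         /* a3 b3 c3 d3 */
}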
ConnectedComponent::ConnectedComponent( std::array<char, 8> value, boost::shared_ptr<pixel_list_type> pixelList, pixel_list_type::const_iterator begin, pixel_list_type::const_iterator end) : _pixels(pixelList), _value(value), _boundingBox(0, 0, 0, 0), _center(0, 0), _centerDirty(true), _pixelRange(begin, end), _bitmapDirty(true) { #ifdef __SSE4_1__ // if there is at least one pixel if (begin != end) { unsigned int*__restrict__ pixels = (unsigned int*)&*begin; unsigned int*__restrict__ pixelsEnd = (unsigned int*)&*end; // Prepare aligned, packed integer values. typedef union { __m128i v; unsigned int a[4]; } xmm_uints; enum {X1, Y1, X2, Y2}; __attribute__((aligned(16))) xmm_uints mins1; __attribute__((aligned(16))) xmm_uints maxs1; mins1.a[X1] = begin->x(); maxs1.a[X1] = begin->x(); mins1.a[Y1] = begin->y(); maxs1.a[Y1] = begin->y(); // Iterate through pixelList until 16-byte alignment is reached. while (((std::uintptr_t) pixels % 16) != 0 && pixels < pixelsEnd) { unsigned int x = pixels[X1]; unsigned int y = pixels[Y1]; mins1.a[X1] = std::min(mins1.a[X1], x); mins1.a[Y1] = std::min(mins1.a[Y1], y); maxs1.a[X1] = std::max(maxs1.a[X1], x); maxs1.a[Y1] = std::max(maxs1.a[Y1], y); pixels += 2; } // Guaranteed to have at least 8 XMM registers, so use 4 for cumulative // values and 2 for vector values. (Using 8+4 of 16 registers on 64-bit // arch yields no performance improvement.) mins1.a[X2] = mins1.a[X1]; mins1.a[Y2] = mins1.a[Y1]; maxs1.a[X2] = maxs1.a[X1]; maxs1.a[Y2] = maxs1.a[Y1]; __m128i mins2 = mins1.v; __m128i maxs2 = maxs1.v; // Vectorized loop. Strides two packed integer vectors, each containing // both X and Y for two pixels. while (pixelsEnd - pixels >= 8) { __m128i pixelPair1 = _mm_load_si128((__m128i*)pixels); __m128i pixelPair2 = _mm_load_si128((__m128i*)(pixels + 4)); pixels += 8; // Prefetch the next iteration's data while these loads complete. _mm_prefetch((const char*)pixels, _MM_HINT_T0); mins1.v = _mm_min_epu32(mins1.v, pixelPair1); maxs1.v = _mm_max_epu32(maxs1.v, pixelPair1); mins2 = _mm_min_epu32(mins2, pixelPair2); maxs2 = _mm_max_epu32(maxs2, pixelPair2); } // Combine stride results. mins1.v = _mm_min_epu32(mins1.v, mins2); maxs1.v = _mm_max_epu32(maxs1.v, maxs2); // Iterate through any remaining pixels. while (pixels < pixelsEnd) { unsigned int x = pixels[X1]; unsigned int y = pixels[Y1]; mins1.a[X1] = std::min(mins1.a[X1], x); mins1.a[Y1] = std::min(mins1.a[Y1], y); maxs1.a[X1] = std::max(maxs1.a[X1], x); maxs1.a[Y1] = std::max(maxs1.a[Y1], y); pixels += 2; } // Read out the packed vectors, combine their lane pairs, and store. _boundingBox.min().x() = (int)std::min(mins1.a[X1], mins1.a[X2]); _boundingBox.min().y() = (int)std::min(mins1.a[Y1], mins1.a[Y2]); _boundingBox.max().x() = (int)std::max(maxs1.a[X1], maxs1.a[X2]) + 1; _boundingBox.max().y() = (int)std::max(maxs1.a[Y1], maxs1.a[Y2]) + 1; }
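/* A minimal standalone sketch of the same SSE4.1 min/max reduction for
 * interleaved (x, y) pairs, without the class plumbing (assumes a 16-byte
 * aligned input holding an even number of pairs, at least two): */
#include <smmintrin.h>
#include <stddef.h>
#include <stdint.h>

void boundsXY(const uint32_t *xy, size_t nPairs, uint32_t out[4])
{
    __m128i mins = _mm_load_si128((const __m128i *)xy); /* x0 y0 x1 y1 */
    __m128i maxs = mins;
    for (size_t i = 4; i + 4 <= 2 * nPairs; i += 4) {
        __m128i v = _mm_load_si128((const __m128i *)(xy + i));
        mins = _mm_min_epu32(mins, v);
        maxs = _mm_max_epu32(maxs, v);
    }
    /* Fold the two (x, y) lane pairs together. */
    mins = _mm_min_epu32(mins, _mm_shuffle_epi32(mins, _MM_SHUFFLE(1, 0, 3, 2)));
    maxs = _mm_max_epu32(maxs, _mm_shuffle_epi32(maxs, _MM_SHUFFLE(1, 0, 3, 2)));
    out[0] = (uint32_t)_mm_extract_epi32(mins, 0); /* min x */
    out[1] = (uint32_t)_mm_extract_epi32(mins, 1); /* min y */
    out[2] = (uint32_t)_mm_extract_epi32(maxs, 0); /* max x */
    out[3] = (uint32_t)_mm_extract_epi32(maxs, 1); /* max y */
}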
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* lastRow: set to 10B if the last (U,V doubled) line should be skipped * lastCol: set to 10B if it's the last column in a line (for handling line widths that are not a multiple of four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data are stored in different arrays. * We start with processing U-data. * * First we fetch four U-values from their array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converted the values to signed words and duplicated * each value, because two pixels always "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D by 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixels: * aabbccdd * aabbccdd, but we can only work on four pixels at once, so we need to save the upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with different coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve the upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D + 120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store it to memory! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above? * Well, we prepared UV data for eight pixels in each line, but can only process four * per loop. So we need to load the upper four pixels' data from memory every second loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Fetch four pixels from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords multiplied by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* now we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for the RGB values. * By shifting left by eight bits and treating the result as packed words, * we get not only signed words but also a division by 256. * The data is now ordered this way: ddx0 ccx0 bbx0 aax0, where x is the least * significant byte, which we don't need anymore since we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we still have signed words, and there are those min/max instructions in SSE2 ... * the max instruction always takes the larger of the two operands and stores it in the first one, * and it respects the sign! * If we feed it our values and zeros, it takes the zeros wherever our values are smaller than * zero, and our values otherwise */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same trick, inverted, can be used to limit our values to 255, * now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we have our bytes. * The moment has come to assemble the three channels R, G and B into xrgb dwords. * On the red channel we just have to AND each future dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on the green channel we have to shuffle so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on the blue channel this one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we OR them together and get this: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* The only thing left to do now is write the data to memory, but this gets a bit more * complicated if the width is not a multiple of four and this is the last column in the line. 
*/ if (lastCol & 0x02) { /* let's say we only need to convert six pixels in width * The first 4 pixels are converted just like any other 4 pixels, but * if it's the last loop in the line, lastCol is shifted left by one (curious? have a look above), * and we land here. During initialisation a mask was prepared. In this case it looks like * FFFFFFFFH FFFFFFFFH 00000000H 00000000H */ r6 = _mm_load_si128(buffer+3); /* we AND our output data with this mask to keep only the valid pixels */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and AND it with the inverse mask, so we get only those pixels which should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to OR the two values together and write them back to the destination array, * and only the pixels that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the second line right here, * in the same loop. All we need to do is add some offsets to the Y- and destination * pointers. These offsets are srcStep[0] and the target scanline. * But if we don't need to process the second line, e.g. when we are in the last line of an odd number of lines, * we just skip all this. */ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* one more thing: we must shift lastCol back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* finally we have to advance the destination and Y-data pointers by four pixels */ pDst += 16; YData += 4; }
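/* The fixed-point arithmetic spread across the blocks above boils down, per pixel,
 * to the conversion below, with D = U - 128 and E = V - 128. A scalar reference
 * sketch (names hypothetical; assumes an arithmetic right shift on signed ints,
 * matching the signed-word shifts in the SSE path): */
#include <stdint.h>

static inline uint8_t clampToByte(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Writes one xrgb dword in memory order B, G, R, X, like the SSE path. */
static void yuvToXrgbPixel(uint8_t Y, uint8_t U, uint8_t V, uint8_t *dst)
{
    int D = U - 128;
    int E = V - 128;
    dst[0] = clampToByte(Y + ((475 * D) >> 8));          /* B: Y + 475*D/256          */
    dst[1] = clampToByte(Y - ((48 * D + 120 * E) >> 8)); /* G: Y - (48*D + 120*E)/256 */
    dst[2] = clampToByte(Y + ((403 * E) >> 8));          /* R: Y + 403*E/256          */
    dst[3] = 0;                                          /* X: unused                 */
}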
FOLLY_DISABLE_SANITIZERS __m128i _mm_load_si128_nosan(__m128i const* const p) { return _mm_load_si128(p); }
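/* Presumably this wrapper exists so that hot SIMD routines can perform an aligned
 * 16-byte load that deliberately reads up to 15 bytes beyond the end of an object
 * (an aligned load never crosses a page boundary, so it cannot fault) without
 * tripping ASan; the attribute macro disables instrumentation for just this one
 * function. */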
inline void prep_dv(__m128i* idx, __m128i& v, __m128& n) { v = _mm_load_si128(idx); n = _mm_cvtepi32_ps(v); }
static inline int blake512_compress( state * state, const u8 * datablock ) { __m128i row1l; __m128i row2l; __m128i row3l; __m128i row4l; u64 row1hl, row1hh; u64 row2hl, row2hh; u64 row3hl, row3hh; u64 row4hl, row4hh; const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9); const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7); union { __m128i u128[8]; u64 u64[16]; } m; __m128i t0, t1, t2, t3, t4, t5, t6, t7; u64 u0, u1, u2, u3; __m128i b0; u64 b1l, b1h; m.u128[0] = _mm_loadu_si128((__m128i*)(datablock + 0)); m.u128[1] = _mm_loadu_si128((__m128i*)(datablock + 16)); m.u128[2] = _mm_loadu_si128((__m128i*)(datablock + 32)); m.u128[3] = _mm_loadu_si128((__m128i*)(datablock + 48)); m.u128[4] = _mm_loadu_si128((__m128i*)(datablock + 64)); m.u128[5] = _mm_loadu_si128((__m128i*)(datablock + 80)); m.u128[6] = _mm_loadu_si128((__m128i*)(datablock + 96)); m.u128[7] = _mm_loadu_si128((__m128i*)(datablock + 112)); m.u128[0] = BSWAP64(m.u128[0]); m.u128[1] = BSWAP64(m.u128[1]); m.u128[2] = BSWAP64(m.u128[2]); m.u128[3] = BSWAP64(m.u128[3]); m.u128[4] = BSWAP64(m.u128[4]); m.u128[5] = BSWAP64(m.u128[5]); m.u128[6] = BSWAP64(m.u128[6]); m.u128[7] = BSWAP64(m.u128[7]); row1l = _mm_load_si128((__m128i*)&state->h[0]); row1hl = state->h[2]; row1hh = state->h[3]; row2l = _mm_load_si128((__m128i*)&state->h[4]); row2hl = state->h[6]; row2hh = state->h[7]; row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL); row3hl = 0xA4093822299F31D0ULL; row3hh = 0x082EFA98EC4E6C89ULL; row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL); row4hl = 0xC0AC29B7C97C50DDULL; row4hh = 0x3F84D5B5B5470917ULL; if(!state->nullt) { row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0])); row4hl ^= state->t[1]; row4hh ^= state->t[1]; } ROUND( 0); ROUND( 1); ROUND( 2); ROUND( 3); ROUND( 4); ROUND( 5); ROUND( 6); ROUND( 7); ROUND( 8); ROUND( 9); ROUND(10); ROUND(11); ROUND(12); ROUND(13); ROUND(14); ROUND(15); row1l = _mm_xor_si128(row3l,row1l); row1hl ^= row3hl; row1hh ^= row3hh; _mm_store_si128((__m128i*)&state->h[0], _mm_xor_si128(row1l, _mm_load_si128((__m128i*)&state->h[0]))); state->h[2] ^= row1hl; state->h[3] ^= row1hh; row2l = _mm_xor_si128(row4l,row2l); row2hl ^= row4hl; row2hh ^= row4hh; _mm_store_si128((__m128i*)&state->h[4], _mm_xor_si128(row2l, _mm_load_si128((__m128i*)&state->h[4]))); state->h[6] ^= row2hl; state->h[7] ^= row2hh; return 0; }
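/* BSWAP64() and ROUND() are macros defined elsewhere in the original file. BSWAP64
 * presumably byte-swaps each 64-bit half of an XMM register using the u8to64 shuffle
 * constant declared above, roughly: */
#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)
/* i.e. it converts the big-endian message words of the data block into the host's
 * little-endian u64 lanes before the rounds consume m.u64[]. */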
void fb_slvn_low(dig_t *c, const dig_t *a) { int i; dig_t *p, u0, u1, u2, u3; void *tab = fb_poly_get_slv(); __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm; perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000); mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100); sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400); t0 = _mm_load_si128((__m128i *)a); t1 = _mm_load_si128((__m128i *)(a + 2)); r0 = r1 = _mm_setzero_si128(); m0 = _mm_shuffle_epi8(t1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_and_si128(m1, mask2); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m0 = _mm_and_si128(t0, mask2); m0 = _mm_shuffle_epi8(m0, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_srli_si128(m1, 8); m1 = _mm_andnot_si128(mask2, m1); m2 = _mm_slli_epi64(m2, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 6); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 7); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 1); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F)); m1 = _mm_slli_epi64(m1, 4); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3)); m1 = _mm_slli_epi64(m1, 2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1)); m1 = _mm_slli_epi64(m1, 1); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000); sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000); m1 = _mm_and_si128(t0, mask0); m2 = _mm_and_si128(t0, mask1); m3 = _mm_and_si128(t1, mask0); m4 = _mm_and_si128(t1, mask1); m2 = 
_mm_srli_epi64(m2, 4); m4 = _mm_srli_epi64(m4, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m4 = _mm_shuffle_epi8(sqrt1, m4); m3 = _mm_shuffle_epi8(sqrt0, m3); m1 = _mm_or_si128(m1, m2); m3 = _mm_or_si128(m3, m4); #ifndef __PCLMUL__ align dig_t x[2]; _mm_store_si128((__m128i *)x, m1); u0 = x[0]; u1 = x[1]; _mm_store_si128((__m128i *)x, m3); u2 = x[0]; u3 = x[1]; #else u0 = _mm_extract_epi64(m1, 0); u1 = _mm_extract_epi64(m1, 1); u2 = _mm_extract_epi64(m3, 0); u3 = _mm_extract_epi64(m3, 1); #endif for (i = 0; i < 8; i++) { p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u0 >>= 8; p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u1 >>= 8; p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u2 >>= 8; p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u3 >>= 8; } _mm_store_si128((__m128i *)c, r0); _mm_store_si128((__m128i *)(c + 2), r1); }
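/* The mask0/mask1 and sqrt0/sqrt1 pairs above implement a classic SSSE3 4-bit table
 * lookup: each byte is split into its low and high nibble, and _mm_shuffle_epi8 uses
 * the nibble as an index into a 16-entry table held in a register. A standalone
 * sketch of the pattern (names hypothetical; the blocks above combine the two
 * lookups with XOR first and OR later): */
#include <tmmintrin.h>

static inline __m128i nibbleLookup(__m128i in, __m128i loTable, __m128i hiTable)
{
    __m128i lo = _mm_and_si128(in, _mm_set1_epi8(0x0F));       /* low nibble of each byte  */
    __m128i hi = _mm_and_si128(in, _mm_set1_epi8((char)0xF0)); /* high nibble of each byte */
    hi = _mm_srli_epi64(hi, 4);                                /* move it into index range */
    return _mm_xor_si128(_mm_shuffle_epi8(loTable, lo),
                         _mm_shuffle_epi8(hiTable, hi));
}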
void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0, in1, in2, in3; __m128i u0, u1; __m128i sum = _mm_setzero_si128(); int i; for (i = 0; i < 8; ++i) { in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); } u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); in1 = _mm_srai_epi32(in1, 16); sum = _mm_add_epi32(in0, in1); in0 = _mm_unpacklo_epi32(sum, u0); in1 = _mm_unpackhi_epi32(sum, u0); sum = _mm_add_epi32(in0, in1); in0 = _mm_srli_si128(sum, 8); in1 = _mm_add_epi32(sum, in0); in1 = _mm_srai_epi32(in1, 3); store_output(&in1, output); }
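/* The routine above computes only the DC term of the 32x32 forward transform.
 * A scalar sketch of the same result, using int32_t in place of tran_low_t and
 * assuming the 16-bit lane accumulations never wrap (which holds for valid
 * pixel-difference inputs): */
#include <stdint.h>

void fdct32x32_1_scalar(const int16_t *input, int32_t *output, int stride)
{
    int32_t sum = 0;
    for (int r = 0; r < 32; ++r)
        for (int c = 0; c < 32; ++c)
            sum += input[r * stride + c];
    /* Matches the final _mm_srai_epi32(..., 3) above. */
    output[0] = sum >> 3;
}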
void rfx_dwt_2d_decode_block_vert_sse2(INT16* l, INT16* h, INT16* dst, int subband_width) { int x, n; INT16* l_ptr = l; INT16* h_ptr = h; INT16* dst_ptr = dst; __m128i l_n; __m128i h_n; __m128i tmp_n; __m128i h_n_m; __m128i dst_n; __m128i dst_n_m; __m128i dst_n_p; int total_width = subband_width + subband_width; /* Even coefficients */ for (n = 0; n < subband_width; n++) { for (x = 0; x < total_width; x+=8) { /* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */ l_n = _mm_load_si128((__m128i*) l_ptr); h_n = _mm_load_si128((__m128i*) h_ptr); tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1)); if (n == 0) tmp_n = _mm_add_epi16(tmp_n, h_n); else { h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width)); tmp_n = _mm_add_epi16(tmp_n, h_n_m); } tmp_n = _mm_srai_epi16(tmp_n, 1); dst_n = _mm_sub_epi16(l_n, tmp_n); _mm_store_si128((__m128i*) dst_ptr, dst_n); l_ptr+=8; h_ptr+=8; dst_ptr+=8; } dst_ptr+=total_width; } h_ptr = h; dst_ptr = dst + total_width; /* Odd coefficients */ for (n = 0; n < subband_width; n++) { for (x = 0; x < total_width; x+=8) { /* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */ h_n = _mm_load_si128((__m128i*) h_ptr); dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width)); h_n = _mm_slli_epi16(h_n, 1); tmp_n = dst_n_m; if (n == subband_width - 1) tmp_n = _mm_add_epi16(tmp_n, dst_n_m); else { dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width)); tmp_n = _mm_add_epi16(tmp_n, dst_n_p); } tmp_n = _mm_srai_epi16(tmp_n, 1); dst_n = _mm_add_epi16(tmp_n, h_n); _mm_store_si128((__m128i*) dst_ptr, dst_n); h_ptr+=8; dst_ptr+=8; } dst_ptr+=total_width; } }
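/* Scalar form of the two lifting steps implemented above, for a single column
 * (a reference sketch using int16_t in place of INT16; the SSE version runs the
 * same math across eight columns at a time with a row stride of total_width).
 * The boundary handling mirrors the vector code, which substitutes h[0] for the
 * missing h[-1] and dst[2n] for the missing dst[2n+2]: */
#include <stdint.h>

void rfx_idwt_vert_scalar(const int16_t *l, const int16_t *h, int16_t *dst, int subband_width)
{
    /* Even output samples: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1) */
    for (int n = 0; n < subband_width; n++) {
        int h_prev = (n == 0) ? h[0] : h[n - 1];
        dst[2 * n] = (int16_t)(l[n] - ((h_prev + h[n] + 1) >> 1));
    }
    /* Odd output samples: dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1) */
    for (int n = 0; n < subband_width; n++) {
        int d_next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
        dst[2 * n + 1] = (int16_t)((h[n] << 1) + ((dst[2 * n] + d_next) >> 1));
    }
}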
void png_read_filter_row_avg3_sse(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; png_bytep rp = row; png_const_bytep prp = prev_row; __m128i nrb = _mm_load_si128((__m128i*)(rp)); __m128i pixel = _mm_setzero_si128(); const __m128i mask = _mm_set1_epi8(0x01); for (i = 0; i < row_info->rowbytes; i += 15, rp += 15, prp += 15) { #ifndef __SSSE3__ __m128i prb = _mm_loadu_si128((__m128i*)prp); #else __m128i prb = _mm_lddqu_si128((__m128i*)prp); #endif __m128i rb = nrb; // First pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Second pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Third pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Fourth pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Fifth pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); #ifndef __SSSE3__ nrb = _mm_loadu_si128((__m128i*)(rp + 15)); rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else nrb = _mm_lddqu_si128((__m128i*)(rp + 15)); rb = _mm_alignr_epi8(pixel, rb, 3); #endif rb = _mm_srli_si128(rb, 1); _mm_storeu_si128((__m128i*)rp, rb); } }
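/* calculate_pixel_avg() is not shown in this excerpt. A plausible definition
 * consistent with how it is called above, implementing PNG's Average filter
 * Recon(x) = Raw(x) + floor((Recon(left) + Prior(up)) / 2) per byte, where
 * 'pixel' carries the previously reconstructed (left) pixel and 'mask' is the
 * all-0x01 constant from the caller: */
static inline __m128i calculate_pixel_avg(__m128i rb, __m128i prb,
                                          __m128i pixel, __m128i mask)
{
    /* _mm_avg_epu8 computes (a + b + 1) >> 1, i.e. it rounds up; subtracting
     * the carry bit ((a ^ b) & 1) turns it into the floor average PNG needs. */
    __m128i avg = _mm_avg_epu8(pixel, prb);
    avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(pixel, prb), mask));
    /* Reconstructed bytes: raw + average, modulo 256, as the filter requires.
     * Only the low three bytes matter per call; the caller shifts them out. */
    return _mm_add_epi8(rb, avg);
}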