template<> void copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const uchar* src = (const uchar*)_src;
        uchar* dst = (uchar*)_dst;
        int x = 0;
        #if CV_SSE4_2
        if(USE_SSE4_2)
        {
            __m128i zero = _mm_setzero_si128 ();

            for( ; x <= size.width - 16; x += 16 )
            {
                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src+x));
                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask+x));
                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst+x));
                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
                _mm_storeu_si128((__m128i*)(dst + x), rDst);
            }
        }
        #endif
        for( ; x < size.width; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }
}
static void
sse3_test_lddqu (double *i1, double *r)
{
  __m128i t1 = _mm_lddqu_si128 ((__m128i *) i1);
  _mm_storeu_si128 ((__m128i *) r, t1);
}
static INLINE unsigned int highbd_masked_sad16xh_avx2(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_const =
      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m256i one = _mm256_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m256i m =
          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);

      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);

      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  return (sad + 31) >> 6;
}
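/* A hedged scalar reference (illustrative only, not a libaom API) of what the AVX2
 * kernel above computes. It assumes AOM_BLEND_A64_ROUND_BITS == 6 (mask_max == 64),
 * which is what the constants in the vector code imply: each prediction sample is the
 * rounded 6-bit blend of a and b under mask m, and the result is the rounded sum of
 * absolute differences against src. */
static unsigned int highbd_masked_sad_scalar_ref(const uint16_t *src, int src_stride,
                                                 const uint16_t *a, int a_stride,
                                                 const uint16_t *b, int b_stride,
                                                 const uint8_t *m, int m_stride,
                                                 int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) {
      /* pred = (a*m + b*(64 - m) + 32) >> 6, mirroring madd_epi16 + round_const + srai */
      const int pred = (a[x] * m[x] + b[x] * (64 - m[x]) + 32) >> 6;
      const int d = pred - (int)src[x];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6; /* same final rounding as the AVX2 version */
}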
void filterScanlinesSSE( unsigned char* filtered, unsigned char* image, unsigned int WIDTH, unsigned int HEIGHT )
{
    int blocks = 3*WIDTH/16;
    // Create move-mask for last block of each scanline
    __m128i mask = _mm_cmplt_epi8( _mm_set_epi8( 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ),
                                   _mm_set1_epi8( 3*WIDTH-16*blocks ) );
    {
        const unsigned char* in = image;
        unsigned char* out = filtered;
        *out++ = 0;
        for(int b=0; b<blocks; b++ ) {
            _mm_storeu_si128( (__m128i*)out, _mm_lddqu_si128( (__m128i const*)in ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }
    for( unsigned int j=1; j<HEIGHT; j++ ) {
        const unsigned char* in = image + 3*WIDTH*(j-1);
        unsigned char* out = filtered + (3*WIDTH+1)*j;
        *out++ = 2;
        for(int b=0; b<blocks; b++ ) {
            __m128i _t0 = _mm_lddqu_si128( (__m128i const*)in );
            __m128i _t1 = _mm_lddqu_si128( (__m128i const*)(in + 3*WIDTH ) );
            _mm_storeu_si128( (__m128i*)out, _mm_sub_epi8( _t1, _t0 ) );
            in += 16;
            out += 16;
        }
        _mm_maskmoveu_si128( _mm_lddqu_si128( (__m128i const*)in ), mask, (char*)out );
    }
}
void png_read_filter_row_sub3_sse(png_row_infop row_info, png_bytep row, png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   __m128i racc = _mm_setzero_si128();

   PNG_UNUSED(prev_row)

   __m128i nrb = _mm_load_si128((__m128i*)(rp));

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15)
   {
      __m128i rb = nrb;

#ifndef __SSSE3__
      nrb = _mm_loadu_si128((__m128i*)(rp + 15));
      racc = _mm_srli_si128(_mm_slli_si128(racc, 1), 13);
      racc = _mm_or_si128(racc, _mm_slli_si128(rb, 3));
#else
      nrb = _mm_lddqu_si128((__m128i*)(rp + 15));
      racc = _mm_alignr_epi8(rb, _mm_slli_si128(racc, 1), 13);
#endif

      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);

      racc = rb;
      _mm_storeu_si128((__m128i*)rp, rb);
   }
}
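/* For reference, a scalar sketch of what the vector loop above reconstructs
 * (illustrative, not part of libpng; the helper name is hypothetical): the PNG
 * "Sub" filter with 3 bytes per pixel adds the reconstructed byte located one
 * pixel (3 bytes) to the left. */
static void png_read_filter_row_sub3_scalar(size_t rowbytes, unsigned char *row)
{
   /* The first pixel (bytes 0..2) has no left neighbour and stays unchanged. */
   for (size_t i = 3; i < rowbytes; i++)
      row[i] = (unsigned char)(row[i] + row[i - 3]);
}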
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. * * at first we fetch four U-values from its array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: * aabbccdd * aabbccdd, but we can only work on four pixel at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixel in each line, but can only process four * per loop. So we need to load the upper four pixel data from memory each secound loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixel from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* no we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and thats always good. * before we had dwords of data, and by shifting left and treating the result * as packed words, we get not only signed words, but do also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we have still signed words, and there are those min/max instructions in SSE2 ... * the max instruction takes always the bigger of the two operands and stores it in the first one, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we got our bytes. * the moment has come to assemble the three channels R,G and B to the xrgb dwords * on Red channel we just have to and each futural dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on Blue channel that one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* Only thing to do know is writing data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in line. 
*/ if (lastCol & 0x02) { /* let's say, we need to only convert six pixel in width * Ok, the first 4 pixel will be converted just like every 4 pixel else, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixel */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. We get only those pixel, which should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixel that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the secound line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the secound line, like if we are in the last line of processing nine lines, * we just skip all this. */ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift [rbp-42] back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointer by four pixel */ pDst += 16; YData += 4; }
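/* A hedged scalar sketch (not part of FreeRDP; helper names are hypothetical) of the
 * fixed-point conversion the SSSE3 routine above implements, using the coefficients
 * that appear in the vector code: D = U - 128, E = V - 128, intermediate values scaled
 * by 256 and clamped to 0..255 before being packed as xrgb. */
static inline unsigned char yuv_clamp(int v)
{
	return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline unsigned int yuv_pixel_to_xrgb(int Y, int U, int V)
{
	const int D = U - 128;
	const int E = V - 128;
	const unsigned char R = yuv_clamp(Y + ((403 * E) >> 8));             /* Y + E*403/256 */
	const unsigned char G = yuv_clamp(Y - ((48 * D + 120 * E) >> 8));    /* Y - (D*48 + E*120)/256 */
	const unsigned char B = yuv_clamp(Y + ((475 * D) >> 8));             /* Y + D*475/256 */
	return ((unsigned int)R << 16) | ((unsigned int)G << 8) | B;         /* xrgb layout, as assembled above */
}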
void png_read_filter_row_avg3_sse(png_row_infop row_info, png_bytep row, png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   png_const_bytep prp = prev_row;
   __m128i nrb = _mm_load_si128((__m128i*)(rp));
   __m128i pixel = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15, prp += 15)
   {
#ifndef __SSSE3__
      __m128i prb = _mm_loadu_si128((__m128i*)prp);
#else
      __m128i prb = _mm_lddqu_si128((__m128i*)prp);
#endif
      __m128i rb = nrb;

      // First pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Second pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Third pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Fourth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Fifth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
#ifndef __SSSE3__
      nrb = _mm_loadu_si128((__m128i*)(rp + 15));
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      nrb = _mm_lddqu_si128((__m128i*)(rp + 15));
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      rb = _mm_srli_si128(rb, 1);
      _mm_storeu_si128((__m128i*)rp, rb);
   }
}
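/* A hedged scalar counterpart of the loop above (illustrative only, helper name
 * hypothetical; assumes 3 bytes per pixel as in the sub3 case): the PNG "Average"
 * filter adds the truncated mean of the byte one pixel to the left and the byte
 * directly above in the previous row. */
static void png_read_filter_row_avg3_scalar(size_t rowbytes,
                                            unsigned char *row,
                                            const unsigned char *prev_row)
{
   for (size_t i = 0; i < rowbytes; i++) {
      const unsigned int left  = (i >= 3) ? row[i - 3] : 0;
      const unsigned int above = prev_row[i];
      row[i] = (unsigned char)(row[i] + ((left + above) >> 1));
   }
}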
int check(size_t N, size_t Nq) {
    int * queries = (int*)malloc(Nq*sizeof(int));
    int * source = (int*)malloc(N*sizeof(int));
    size_t i, k;
    int displaytest = 0;
    for(i = 0; i < N; ++i) {
        source[i] = rand();
    }
    qsort (source, N, sizeof(int), compare);
    if(displaytest) {
        for(i = 0; i < N; ++i) {
            printf(" %d ",source[i]);
        }
        printf("\n");
    }
    int maxval = source[N-1];
    for(i = 0; i < Nq; ++i) {
        queries[i] = rand()%(maxval+1);
    }
    for(k = 0; k < Nq; ++k)
        if(branchy_search(source,N,queries[k]) != branchfree_search(source,N,queries[k])) {
            printf("bug1\n");
            free(source);
            free(queries);
            return -1;
        }
    for(k = 0; k+1 < Nq; k+=2) {
        size_t i1, i2;
        branchfree_search2(source,N,queries[k],queries[k+1],&i1,&i2);
        if(branchfree_search(source,N,queries[k]) != i1) {
            printf("bug2\n");
            free(source);
            free(queries);
            return -1;
        }
        if(branchfree_search(source,N,queries[k+1]) != i2) {
            printf("bug3\n");
            free(source);
            free(queries);
            return -1;
        }
    }
#ifdef MYAVX
    for(k = 0; k+3 < Nq; k+=4) {
        size_t i1, i2, i3, i4;
        __m128i q = _mm_lddqu_si128((__m128i const*)(queries +k));
        __m128i bog = branchfree_search4_avx(source,N,q);
        branchfree_search4(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],&i1,&i2,&i3,&i4);
        if((_mm_extract_epi32(bog,0)!= i1) || (_mm_extract_epi32(bog,1)!= i2)
           || (_mm_extract_epi32(bog,2)!= i3) || (_mm_extract_epi32(bog,3)!= i4)) {
            printf("bug3\n");
            printf("%zu %zu %zu %zu\n",i1,i2,i3,i4);
            printf("%d %d %d %d\n",_mm_extract_epi32(bog,0),_mm_extract_epi32(bog,1),_mm_extract_epi32(bog,2),_mm_extract_epi32(bog,3));
            return -1;
        }
    }
#endif
    free(source);
    free(queries);
    return 0;
}
int demo(size_t N, size_t Nq) { int * queries = (int*)malloc(Nq*sizeof(int)); int * source = (int*)malloc(N*sizeof(int)); size_t bogus = 0; size_t bogus1 = 0; size_t bogus2 = 0; size_t bogus3 = 0; size_t bogus4 = 0; __m128i bog = _mm_setzero_si128(); size_t i, k, ti; printf("===============\n"); printf("array size (N)=%zu, number of queries (Nq)=%zu...\n",N,Nq); printf("preparing data...\n"); for(i = 0; i < N; ++i) { source[i] = rand(); } qsort (source, N, sizeof(int), compare); int maxval = source[N-1]; for(i = 0; i < Nq; ++i) { queries[i] = rand()%(maxval+1); } printf("beginning tests...\n"); printf("\n"); for(ti = 0; ti < 3; ++ti) { struct timeval t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13; gettimeofday(&t6, 0); for(k = 0; k+1 < Nq; k+=2) branchfree_search2_prefetch(source,N,queries[k],queries[k+1],&bogus1,&bogus2); gettimeofday(&t1, 0); for(k = 0; k < Nq; ++k) bogus += branchfree_search(source,N,queries[k]); gettimeofday(&t2, 0); for(k = 0; k < Nq; ++k) bogus += branchy_search(source,N,queries[k]); gettimeofday(&t3, 0); for(k = 0; k < Nq; ++k) bogus += branchfree_search_prefetch(source,N,queries[k]); gettimeofday(&t4, 0); for(k = 0; k+1 < Nq; k+=2) branchfree_search2(source,N,queries[k],queries[k+1],&bogus1,&bogus2); gettimeofday(&t5, 0); for(k = 0; k+3 < Nq; k+=4) branchfree_search4(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],&bogus1,&bogus2,&bogus3,&bogus4); gettimeofday(&t7, 0); for(k = 0; k+3 < Nq; k+=4) branchfree_search4_prefetch(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],&bogus1,&bogus2,&bogus3,&bogus4); gettimeofday(&t8, 0); #ifdef MYAVX for(k = 0; k+3 < Nq; k+=4) { __m128i q = _mm_lddqu_si128((__m128i const*)(queries +k)); bog = _mm_add_epi32(bog,branchfree_search4_avx(source,N,q)); } gettimeofday(&t9, 0); for(k = 0; k+7 < Nq; k+=8) { __m256i q = _mm256_lddqu_si256((__m256i const*)(queries +k)); bog = _mm_add_epi32(bog,_mm256_castsi256_si128(branchfree_search8_avx(source,N,q))); } #endif gettimeofday(&t10, 0); for(k = 0; k+7 < Nq; k+=8) { branchfree_search8(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],queries[k+4],queries[k+5],queries[k+6],queries[k+7],&bogus1,&bogus2,&bogus3,&bogus4,&bogus1,&bogus2,&bogus3,&bogus4); } gettimeofday(&t11, 0); for(k = 0; k+7 < Nq; k+=8) { branchfree_search8_prefetch(source,N,queries[k],queries[k+1],queries[k+2],queries[k+3],queries[k+4],queries[k+5],queries[k+6],queries[k+7],&bogus1,&bogus2,&bogus3,&bogus4,&bogus1,&bogus2,&bogus3,&bogus4); } gettimeofday(&t12, 0); for(k = 0; k < Nq; ++k) bogus += hackedbranchfree_search(source,N,queries[k]); gettimeofday(&t13, 0); printf("branchless time=%llu \n",t2.tv_sec * 1000ULL * 1000ULL + t2.tv_usec - (t1.tv_sec * 1000ULL * 1000ULL + t1.tv_usec)); printf("branchy time=%llu \n",t3.tv_sec * 1000ULL * 1000ULL + t3.tv_usec - (t2.tv_sec * 1000ULL * 1000ULL + t2.tv_usec)); printf("branchless time with prefetch=%llu \n",t4.tv_sec * 1000ULL * 1000ULL + t4.tv_usec - (t3.tv_sec * 1000ULL * 1000ULL + t3.tv_usec)); printf("branchless interleaved (2) time=%llu \n",t5.tv_sec * 1000ULL * 1000ULL + t5.tv_usec - (t4.tv_sec * 1000ULL * 1000ULL + t4.tv_usec)); printf("branchless interleaved (2) (prefetch) time=%llu \n",t1.tv_sec * 1000ULL * 1000ULL + t1.tv_usec - (t6.tv_sec * 1000ULL * 1000ULL + t6.tv_usec)); printf("branchless interleaved (4) time=%llu \n",t7.tv_sec * 1000ULL * 1000ULL + t7.tv_usec - (t5.tv_sec * 1000ULL * 1000ULL + t5.tv_usec)); printf("branchless interleaved (4) (prefetch) time=%llu \n",t8.tv_sec * 1000ULL * 1000ULL + t8.tv_usec - (t7.tv_sec * 
1000ULL * 1000ULL + t7.tv_usec)); #ifdef MYAVX printf("branchless interleaved (4) (AVX) time=%llu \n",t9.tv_sec * 1000ULL * 1000ULL + t9.tv_usec - (t8.tv_sec * 1000ULL * 1000ULL + t8.tv_usec)); printf("branchless interleaved (8) (AVX) time=%llu \n",t10.tv_sec * 1000ULL * 1000ULL + t10.tv_usec - (t9.tv_sec * 1000ULL * 1000ULL + t9.tv_usec)); #endif printf("branchless interleaved (8) time=%llu \n",t11.tv_sec * 1000ULL * 1000ULL + t11.tv_usec - (t10.tv_sec * 1000ULL * 1000ULL + t10.tv_usec)); printf("branchless interleaved (8) (prefetch) time=%llu \n",t12.tv_sec * 1000ULL * 1000ULL + t12.tv_usec - (t11.tv_sec * 1000ULL * 1000ULL + t11.tv_usec)); printf("hacked branchless time=%llu \n",t13.tv_sec * 1000ULL * 1000ULL + t13.tv_usec - (t12.tv_sec * 1000ULL * 1000ULL + t12.tv_usec)); printf("\n"); } #ifdef MYAVX bogus += _mm_extract_epi32(bog,0); #endif free(source); free(queries); return (int) bogus+bogus1+bogus2+bogus3+bogus4; }
__m128i test_mm_lddqu_si128(__m128i const* P) {
  // CHECK-LABEL: test_mm_lddqu_si128
  // CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %{{.*}})
  return _mm_lddqu_si128(P);
}
/* ------------------------------------------------------------------------- */ pstatus_t ssse3_sign_16s( const INT16 *pSrc, INT16 *pDst, INT32 len) { const INT16 *sptr = (const INT16 *) pSrc; INT16 *dptr = (INT16 *) pDst; size_t count; if (len < 16) { return general_sign_16s(pSrc, pDst, len); } /* Check for 16-byte alignment (eventually). */ if ((ULONG_PTR) pDst & 0x01) { return general_sign_16s(pSrc, pDst, len); } /* Seek 16-byte alignment. */ while ((ULONG_PTR) dptr & 0x0f) { INT16 src = *sptr++; *dptr++ = (src < 0) ? (-1) : ((src > 0) ? 1 : 0); if (--len == 0) return PRIMITIVES_SUCCESS; } /* Do 32-short chunks using 8 XMM registers. */ count = len >> 5; /* / 32 */ len -= count << 5; /* * 32 */ if ((ULONG_PTR) sptr & 0x0f) { /* Unaligned */ while (count--) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; xmm0 = _mm_set1_epi16(0x0001U); xmm1 = _mm_set1_epi16(0x0001U); xmm2 = _mm_set1_epi16(0x0001U); xmm3 = _mm_set1_epi16(0x0001U); xmm4 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm5 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm6 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm7 = _mm_lddqu_si128((__m128i *) sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm4); xmm1 = _mm_sign_epi16(xmm1, xmm5); xmm2 = _mm_sign_epi16(xmm2, xmm6); xmm3 = _mm_sign_epi16(xmm3, xmm7); _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; } } else { /* Aligned */ while (count--) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; xmm0 = _mm_set1_epi16(0x0001U); xmm1 = _mm_set1_epi16(0x0001U); xmm2 = _mm_set1_epi16(0x0001U); xmm3 = _mm_set1_epi16(0x0001U); xmm4 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm5 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm6 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm7 = _mm_load_si128((__m128i *) sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm4); xmm1 = _mm_sign_epi16(xmm1, xmm5); xmm2 = _mm_sign_epi16(xmm2, xmm6); xmm3 = _mm_sign_epi16(xmm3, xmm7); _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm1); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm2); dptr += 8; _mm_store_si128((__m128i *) dptr, xmm3); dptr += 8; } } /* Do 8-short chunks using two XMM registers. */ count = len >> 3; len -= count << 3; while (count--) { __m128i xmm0 = _mm_set1_epi16(0x0001U); __m128i xmm1 = LOAD_SI128(sptr); sptr += 8; xmm0 = _mm_sign_epi16(xmm0, xmm1); _mm_store_si128((__m128i *) dptr, xmm0); dptr += 8; } /* Do leftovers. */ while (len--) { INT16 src = *sptr++; *dptr++ = (src < 0) ? -1 : ((src > 0) ? 1 : 0); } return PRIMITIVES_SUCCESS; }
  0x0,0x1,0x2,0x3,0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  0x8,0x9,0xa,0xb,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  0x0,0x1,0x2,0x3,0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  0x4,0x5,0x6,0x7,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  0x0,0x1,0x2,0x3,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
  0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
};

// write vector new, while omitting repeated values assuming that previously written vector was "old"
static int store_unique(__m128i old, __m128i new, uint32_t * output) {
    __m128i vecTmp = _mm_alignr_epi8(new, old, 16 - 4);
    int M = _mm_movemask_epi8(_mm_cmpeq_epi32(vecTmp, new)); //_pdep_u32(,0x1111);
    M = _pext_u32(M, 0x1111);
    int numberofnewvalues = 4 - _mm_popcnt_u32(M);
    __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
    __m128i val = _mm_shuffle_epi8(new, key);
    _mm_storeu_si128((__m128i *)output, val);
    return numberofnewvalues;
}

// working in-place, this function overwrites the repeated values
static uint32_t unique(uint32_t * out, uint32_t len) {
    uint32_t pos = 1;
    for(uint32_t i = 1; i < len; ++i) {
        if(out[i] != out[i-1]) {
            out[pos++] = out[i];
        }
    }
    return pos;
}
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, l_end_pre, j; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m128d v_tmp; __m128d v00, v01, v02, v03; __m128d v10, v11, v12, v13; __m128i v_cur_roots, v_old_roots, v_new_roots; __m128 v_rootmask; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx2, idx3, pad, pad_r; idx1 = (n+1)*(n+2)/2 + n/2; e[idx1] = q[n]; idx1++; pad = 1; // pad contains the padding for row i+1 // for row n it's always 1 for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } // idx2 now points to the beginning of the next line. idx2 += pad; // padding of line i+1 idx3 = idx1; pad_r = pad; // padding of line r for (r = i; r < n; ++r) { pad_r = !pad_r; // padding of line r+1 // idx2 = IDX(r+1, r+1); idx1 = idx3; l_end = idx2 + (n-r); e_tmp = e[idx1++]; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&3); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1] = r; } idx1++; } v_tmp = _mm_set_pd( e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm_set_epi32(r, r, r, r); for( ; idx2 < l_end; idx2 += 4 ) { v01 = _mm_load_pd( &w[idx1 ] ); v11 = _mm_load_pd( &w[idx1+2] ); v00 = _mm_load_pd( &e[idx2 ] ); v01 = _mm_add_pd( v01, v_tmp ); // supoptimal for raw-dependency v10 = _mm_load_pd( &e[idx2+2] ); v11 = _mm_add_pd( v11, v_tmp ); v01 = _mm_add_pd( v01, v00 ); v03 = _mm_load_pd( &e[idx1 ] ); v11 = _mm_add_pd( v11, v10 ); v13 = _mm_load_pd( &e[idx1+2] ); v02 = _mm_cmplt_pd( v01, v03 ); v12 = _mm_cmplt_pd( v11, v13 ); v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), _mm_andnot_pd( v02, v03 )); v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 )); _mm_store_pd( &e[idx1 ], v00 ); _mm_store_pd( &e[idx1+2], v10 ); v_rootmask = _mm_shuffle_ps( _mm_castpd_ps( v02 ), _mm_castpd_ps( v12 ), _MM_SHUFFLE(0,2,0,2) ); v_old_roots = _mm_lddqu_si128( &root[idx1] ); v_new_roots = _mm_or_si128( _mm_and_si128( v_cur_roots, _mm_castps_si128( v_rootmask ) ), _mm_andnot_si128( v_old_roots, _mm_castps_si128( v_rootmask ) ) ); _mm_storeu_si128( &root[idx1], v_new_roots ); idx1 += 4; } idx2 += pad_r; idx3++; } pad = !pad; // every other line as padding 0, or 1, respectively } // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[n + !(n&1)]; }
void PPUThread::cpu_task()
{
    //SetHostRoundingMode(FPSCR_RN_NEAR);

    if (custom_task)
    {
        if (check_status()) return;

        return custom_task(*this);
    }

    g_tls_log_prefix = []
    {
        const auto cpu = static_cast<PPUThread*>(get_current_cpu_thread());

        return fmt::format("%s [0x%08x]", cpu->get_name(), cpu->pc);
    };

    const auto base = vm::_ptr<const u8>(0);

    // Select opcode table
    const auto& table = *(
        g_cfg_ppu_decoder.get() == ppu_decoder_type::precise ? &s_ppu_interpreter_precise.get_table() :
        g_cfg_ppu_decoder.get() == ppu_decoder_type::fast ? &s_ppu_interpreter_fast.get_table() :
        throw std::logic_error("Invalid PPU decoder"));

    v128 _op;
    decltype(&ppu_interpreter::UNK) func0, func1, func2, func3;

    while (true)
    {
        if (UNLIKELY(state.load()))
        {
            if (check_status()) return;
        }

        // Reinitialize
        {
            const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
            _op.vi = _ops;
            const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff)));
            func0 = table[_i._u32[0]];
            func1 = table[_i._u32[1]];
            func2 = table[_i._u32[2]];
            func3 = table[_i._u32[3]];
        }

        while (LIKELY(func0(*this, { _op._u32[0] })))
        {
            if (pc += 4, LIKELY(func1(*this, { _op._u32[1] })))
            {
                if (pc += 4, LIKELY(func2(*this, { _op._u32[2] })))
                {
                    pc += 4;
                    func0 = func3;

                    const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc + 4)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
                    _op.vi = _mm_alignr_epi8(_ops, _op.vi, 12);
                    const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff)));
                    func1 = table[_i._u32[1]];
                    func2 = table[_i._u32[2]];
                    func3 = table[_i._u32[3]];

                    if (UNLIKELY(state.load()))
                    {
                        break;
                    }

                    continue;
                }

                break;
            }

            break;
        }
    }
}
QT_BEGIN_NAMESPACE

// Convert a scanline of RGB888 (src) to RGB32 (dst)
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len)
{
    quint32 *const end = dst + len;

    // Prologue, align dst to 16 bytes. The alignment is done on dst because it has 4 store()
    // for each 3 load() of src.
    const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
    const int prologLength = qMin(len, offsetToAlignOn16Bytes);

    for (int i = 0; i < prologLength; ++i) {
        *dst++ = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // Mask the 4 first colors of the RGB888 vector
    const __m128i shuffleMask = _mm_set_epi8(char(0xff), 9, 10, 11, char(0xff), 6, 7, 8, char(0xff), 3, 4, 5, char(0xff), 0, 1, 2);

    // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB)
    const __m128i shuffleMaskEnd = _mm_set_epi8(char(0xff), 13, 14, 15, char(0xff), 10, 11, 12, char(0xff), 7, 8, 9, char(0xff), 4, 5, 6);

    // Mask to have alpha = 0xff
    const __m128i alphaMask = _mm_set1_epi32(0xff000000);

    __m128i *inVectorPtr = (__m128i *)src;
    __m128i *dstVectorPtr = (__m128i *)dst;

    const int simdRoundCount = (len - prologLength) / 16; // one iteration in the loop converts 16 pixels
    for (int i = 0; i < simdRoundCount; ++i) {
        /*
         RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
         to load vectors of RGB888 and use palignr to select a vector out of two vectors.

         After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last
         vector of RGB888, we can mask it directly to get a last store of RGB32. After that,
         the first next byte is a R, and we can loop for the next 16 pixels.

         The conversion itself is done with a byte permutation (pshufb).
        */
        __m128i firstSrcVector = _mm_lddqu_si128(inVectorPtr);
        __m128i outputVector = _mm_shuffle_epi8(firstSrcVector, shuffleMask);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes
        // and load the next input with palignr
        __m128i secondSrcVector = _mm_lddqu_si128(inVectorPtr);
        __m128i srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 12);
        outputVector = _mm_shuffle_epi8(srcVector, shuffleMask);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++inVectorPtr;
        ++dstVectorPtr;
        firstSrcVector = secondSrcVector;

        // We now have 8 unused bytes left in firstSrcVector
        secondSrcVector = _mm_lddqu_si128(inVectorPtr);
        srcVector = _mm_alignr_epi8(secondSrcVector, firstSrcVector, 8);
        outputVector = _mm_shuffle_epi8(srcVector, shuffleMask);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are now 12 unused bytes in firstSrcVector.
        // We can mask them directly, almost there.
        outputVector = _mm_shuffle_epi8(secondSrcVector, shuffleMaskEnd);
        _mm_store_si128(dstVectorPtr, _mm_or_si128(outputVector, alphaMask));
        ++dstVectorPtr;
    }
    src = (uchar *)inVectorPtr;
    dst = (quint32 *)dstVectorPtr;

    while (dst != end) {
        *dst++ = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base) { const signed char* src_data = (signed char*)(src->imageData + base); unsigned char * dst_data = (unsigned char*)(dst->imageData + base); const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep; __m128i pixels[3]; // Load first two rows //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit src_data += src->widthStep; //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q); src_data += src->widthStep; int phase = 2; __m128i * phase_map[3][3] = { {pixels+1, pixels+2, pixels}, {pixels+2, pixels, pixels+1}, {pixels, pixels+1, pixels+2}, }; while (src_data < src_end) { register __m128i weight = ones.q; register __m128i code = _mm_setzero_si128(); //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q); //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q); pixels[phase] = _mm_lddqu_si128((__m128i*)src_data); src_data += src->widthStep; dst_data += dst->widthStep; _mm_prefetch(src_data, _MM_HINT_T0); register __m128i a = *(phase_map[phase][0]); register __m128i b = *(phase_map[phase][1]); register __m128i c = *(phase_map[phase][2]); phase++; phase = (phase == 3) ? 0 : phase; // X . . A // . o . B // . . . C code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . X . // . . // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight)); weight = _mm_slli_epi64(weight, 1); // . . X // . . // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . X // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . . // . . X code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . . // . X . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . . // X . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // X . // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight)); _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write } }
static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
  __m256i a = _mm256_castsi128_si256(a0);
  return _mm256_inserti128_si256(a, a1, 1);
}