void __ext_v_andnot(unsigned char *output, int outlen,
                    unsigned char *input1, int inlen1,
                    unsigned char *input2, int inlen2)
{
    int cnt = 0;
    /* inlen1 is a bit length; round up to whole bytes. */
    int bytelen1 = inlen1 / 8 + ((inlen1 % 8) > 0);
    /* 16 bytes at a time: output = (~input1) & input2 */
    while (cnt + 16 <= bytelen1) {
        __m128i mi1 = _mm_loadu_si128((__m128i *) (input1 + cnt));
        __m128i mi2 = _mm_loadu_si128((__m128i *) (input2 + cnt));
        _mm_storeu_si128((__m128i *) (output + cnt), _mm_andnot_si128(mi1, mi2));
        cnt += 16;
    }
    /* scalar tail for the remaining bytes */
    while (cnt < bytelen1) {
        output[cnt] = (~input1[cnt]) & input2[cnt];
        cnt++;
    }
    /* Note: outlen is passed by value, so this assignment never reaches the caller. */
    outlen = inlen1;
}
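/* A minimal standalone check of the routine above (a hypothetical test harness,
 * not part of the original source): both the 16-byte SSE2 path and the scalar
 * tail must produce (~input1[i]) & input2[i] for every byte. */
#include <stdio.h>

int main(void) {
    unsigned char a[24], b[24], out[24];
    for (int i = 0; i < 24; i++) { a[i] = (unsigned char)(i * 7); b[i] = (unsigned char)(255 - i); }
    __ext_v_andnot(out, 192, a, 192, b, 192); /* 24 bytes == 192 bits */
    for (int i = 0; i < 24; i++) {
        if (out[i] != (unsigned char)((~a[i]) & b[i])) { printf("mismatch at byte %d\n", i); return 1; }
    }
    printf("ok\n");
    return 0;
}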
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa, pb;
    GetSumAbsDiff32_SSE2(&T, &TL, &pa);   // pa = sum |T-TL|
    GetSumAbsDiff32_SSE2(&L, &TL, &pb);   // pb = sum |L-TL|
    {
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);   // pred = (pb > pa) ? L : T
      const __m128i res = _mm_sub_epi8(src, pred);
      _mm_storeu_si128((__m128i*)&out[i], res);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
static inline __m128i SkMin32_SSE2(const __m128i& a, const __m128i& b) {
    __m128i cmp = _mm_cmplt_epi32(a, b);
    return _mm_or_si128(_mm_and_si128(cmp, a), _mm_andnot_si128(cmp, b));
}
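/* The cmp/and/andnot/or sequence above is the SSE2 idiom for a per-lane signed
 * 32-bit minimum; where SSE4.1 is available the same result is one intrinsic.
 * Alternative sketch, not part of the original Skia source: */
#include <smmintrin.h>  /* SSE4.1 */

static inline __m128i SkMin32_SSE41(const __m128i& a, const __m128i& b) {
    return _mm_min_epi32(a, b);
}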
static void s1 ( KTYPE a1, KTYPE a2, KTYPE a3, KTYPE a4, KTYPE a5, KTYPE a6, KTYPE *out1, KTYPE *out2, KTYPE *out3, KTYPE *out4 ) { aligned register KTYPE x1, x2, x3, x4, x5, x6, x7, x8; aligned register KTYPE x9, x10, x11, x12, x13, x14, x15, x16; aligned register KTYPE x17, x18, x19, x20, x21, x22, x23, x24; aligned register KTYPE x25, x26, x27, x28, x29, x30, x31, x32; aligned register KTYPE x33, x34, x35, x36, x37, x38, x39, x40; aligned register KTYPE x41, x42, x43, x44, x45, x46, x47, x48; aligned register KTYPE x49, x50, x51, x52, x53, x54, x55, x56; aligned register KTYPE x57, x58, x59, x60, x61, x62, x63; x1 = _mm_andnot_si128(a4, KCONST_1); x2 = _mm_andnot_si128(a1, KCONST_1); x3 = a4 ^ a3; x4 = x3 ^ x2; x5 = a3 | x2; x6 = x5 & x1; x7 = a6 | x6; x8 = x4 ^ x7; x9 = x1 | x2; x10 = a6 & x9; x11 = x7 ^ x10; x12 = a2 | x11; x13 = x8 ^ x12; x14 = x9 ^ x13; x15 = a6 | x14; x16 = x1 ^ x15; x17 = _mm_andnot_si128(x14, KCONST_1); x18 = x17 & x3; x19 = a2 | x18; x20 = x16 ^ x19; x21 = a5 | x20; x22 = x13 ^ x21; *out4 ^= x22; x23 = a3 | x4; x24 = _mm_andnot_si128(x23, KCONST_1); x25 = a6 | x24; x26 = x6 ^ x25; x27 = x1 & x8; x28 = a2 | x27; x29 = x26 ^ x28; x30 = x1 | x8; x31 = x30 ^ x6; x32 = x5 & x14; x33 = x32 ^ x8; x34 = a2 & x33; x35 = x31 ^ x34; x36 = a5 | x35; x37 = x29 ^ x36; *out1 ^= x37; x38 = a3 & x10; x39 = x38 | x4; x40 = a3 & x33; x41 = x40 ^ x25; x42 = a2 | x41; x43 = x39 ^ x42; x44 = a3 | x26; x45 = x44 ^ x14; x46 = a1 | x8; x47 = x46 ^ x20; x48 = a2 | x47; x49 = x45 ^ x48; x50 = a5 & x49; x51 = x43 ^ x50; *out2 ^= x51; x52 = x8 ^ x40; x53 = a3 ^ x11; x54 = x53 & x5; x55 = a2 | x54; x56 = x52 ^ x55; x57 = a6 | x4; x58 = x57 ^ x38; x59 = x13 & x56; x60 = a2 & x59; x61 = x58 ^ x60; x62 = a5 & x61; x63 = x56 ^ x62; *out3 ^= x63; }
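/* In the bitsliced S-box above, x1 and x2 use _mm_andnot_si128 against KCONST_1
 * as a bitwise NOT; assuming KCONST_1 is an all-ones constant (the usual choice
 * in bitsliced DES code), the idiom in isolation is the sketch below. The ^, |
 * and & operators applied directly to KTYPE values rely on the compiler's
 * vector-operator support or on KTYPE being a plain integer type. */
#include <emmintrin.h>

static inline __m128i bitslice_not(__m128i x) {
    const __m128i ones = _mm_set1_epi32(-1);   /* all bits set */
    return _mm_andnot_si128(x, ones);          /* (~x) & ones == ~x */
}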
static inline __m128i softlight_byte_SSE2(const __m128i& sc, const __m128i& dc, const __m128i& sa, const __m128i& da) { __m128i tmp1, tmp2, tmp3; // int m = da ? dc * 256 / da : 0; __m128i cmp = _mm_cmpeq_epi32(da, _mm_setzero_si128()); __m128i m = _mm_slli_epi32(dc, 8); __m128 x = _mm_cvtepi32_ps(m); __m128 y = _mm_cvtepi32_ps(da); m = _mm_cvttps_epi32(_mm_div_ps(x, y)); m = _mm_andnot_si128(cmp, m); // if (2 * sc <= sa) tmp1 = _mm_slli_epi32(sc, 1); // 2 * sc __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa); tmp1 = _mm_sub_epi32(tmp1, sa); // 2 * sc - sa tmp2 = _mm_sub_epi32(_mm_set1_epi32(256), m); // 256 - m tmp1 = Multiply32_SSE2(tmp1, tmp2); tmp1 = _mm_srai_epi32(tmp1, 8); tmp1 = _mm_add_epi32(sa, tmp1); tmp1 = Multiply32_SSE2(dc, tmp1); __m128i rc1 = _mm_andnot_si128(cmp1, tmp1); // else if (4 * dc <= da) tmp2 = _mm_slli_epi32(dc, 2); // dc * 4 __m128i cmp2 = _mm_cmpgt_epi32(tmp2, da); __m128i i = _mm_slli_epi32(m, 2); // 4 * m __m128i j = _mm_add_epi32(i, _mm_set1_epi32(256)); // 4 * m + 256 __m128i k = Multiply32_SSE2(i, j); // 4 * m * (4 * m + 256) __m128i t = _mm_sub_epi32(m, _mm_set1_epi32(256)); // m - 256 i = Multiply32_SSE2(k, t); // 4 * m * (4 * m + 256) * (m - 256) i = _mm_srai_epi32(i, 16); // >> 16 j = Multiply32_SSE2(_mm_set1_epi32(7), m); // 7 * m tmp2 = _mm_add_epi32(i, j); i = Multiply32_SSE2(dc, sa); // dc * sa j = _mm_slli_epi32(sc, 1); // 2 * sc j = _mm_sub_epi32(j, sa); // 2 * sc - sa j = Multiply32_SSE2(da, j); // da * (2 * sc - sa) tmp2 = Multiply32_SSE2(j, tmp2); // * tmp tmp2 = _mm_srai_epi32(tmp2, 8); // >> 8 tmp2 = _mm_add_epi32(i, tmp2); cmp = _mm_andnot_si128(cmp2, cmp1); __m128i rc2 = _mm_and_si128(cmp, tmp2); __m128i rc = _mm_or_si128(rc1, rc2); // else tmp3 = sqrt_unit_byte_SSE2(m); tmp3 = _mm_sub_epi32(tmp3, m); tmp3 = Multiply32_SSE2(j, tmp3); // j = da * (2 * sc - sa) tmp3 = _mm_srai_epi32(tmp3, 8); tmp3 = _mm_add_epi32(i, tmp3); // i = dc * sa cmp = _mm_and_si128(cmp1, cmp2); __m128i rc3 = _mm_and_si128(cmp, tmp3); rc = _mm_or_si128(rc, rc3); tmp1 = _mm_sub_epi32(_mm_set1_epi32(255), da); // 255 - da tmp1 = _mm_mullo_epi16(sc, tmp1); tmp2 = _mm_sub_epi32(_mm_set1_epi32(255), sa); // 255 - sa tmp2 = _mm_mullo_epi16(dc, tmp2); rc = _mm_add_epi32(rc, tmp1); rc = _mm_add_epi32(rc, tmp2); return clamp_div255round_SSE2(rc); }
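/* softlight_byte_SSE2 merges its three branch results with the
 * cmp/and/andnot/or pattern; factored out as a helper it reads as below
 * (a sketch for clarity, not part of the original Skia source). Per lane:
 * pick a where mask is all-ones, b where mask is all-zeros. */
#include <emmintrin.h>

static inline __m128i select_si128(const __m128i& mask, const __m128i& a, const __m128i& b) {
    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}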
static inline __m128i _mm_min_epi8_rpl(__m128i a, __m128i b) {
    __m128i mask = _mm_cmpgt_epi8(b, a);   // all-ones where a < b
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    return _mm_or_si128(a, b);
}
void merge() { #if defined(SSE_MERGE) || defined(SSE_MERGE_UNROLL) __m128i isTrue = _mm_set1_epi16(0xFFFF); #endif for (int i = 0; i < NUM_PAGES; ++i) { //merge in everything thats different between the ref and the latest committed page (that we haven't touched) #ifdef PREFETCH for (int pages = 1; pages <= PREFETCH_PAGES; pages++) { for (int bpp = 0; bpp < PREFETCH_BYTES_PER_PAGE; bpp++) { __builtin_prefetch( &LATEST[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ ); __builtin_prefetch( &REF[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ ); // don't prefetch LOCAL since we generally don't need it //__builtin_prefetch( &LOCAL[i+pages][bpp], 1/*write*/, 3/*high temporal locality*/ ); } } #endif #ifdef BYTE_MERGE const char* latest = LATEST[i]; const char* ref = REF[i]; char* local = LOCAL[i]; for (int j = 0; j < PAGE_SIZE; ++j) { if ( unlikely(latest[j]!=ref[j] && local[j]==ref[j]) ){ local[j] = latest[j]; } } #endif #ifdef WORD_MERGE const uint64_t* latest = (const uint64_t*) LATEST[i]; const uint64_t* ref = (const uint64_t*) REF[i]; uint64_t* local = (uint64_t*) LOCAL[i]; for (int j = 0; j < (PAGE_SIZE/sizeof(uint64_t)); ++j) { // check for diff at word granularity first if ( unlikely(latest[j]!=ref[j]) ) { if ( local[j] == ref[j] ) { local[j] = latest[j]; } else { // have to do byte-wise comparison const char* latestChar = (const char*) latest[j]; const char* refChar = (const char*) ref[j]; char* localChar = (char*) local[j]; for ( int k = 0; k < sizeof(uint64_t); k++ ) { if ( latestChar[k] != refChar[k] && localChar[k] == refChar[k] ) { localChar[k] = latestChar[k]; } } } } } #endif #ifdef SSE_MERGE const char* latestP = LATEST[i]; const char* refP = REF[i]; char* localP = LOCAL[i]; for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) { __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) ); __m128i ref = _mm_load_si128( (__m128i*) (refP+j) ); __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } } #endif #ifdef SSE_MERGE_NOBRANCH for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) { __m128i latest = _mm_load_si128( (__m128i*) &LATEST[i][j] ); __m128i ref = _mm_load_si128( (__m128i*) &REF[i][j] ); __m128i local = _mm_load_si128( (__m128i*) &LOCAL[i][j] ); __m128i latref = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones __m128i tmp = _mm_cmpeq_epi8(local, ref); latref = _mm_andnot_si128( latref, tmp ); // (~latref) & localref // update = (latref & latest) | (~latref & local); tmp = _mm_and_si128(latref, latest); __m128i localBytes = _mm_andnot_si128(latref, local); tmp = _mm_or_si128(tmp, localBytes); _mm_stream_si128( (__m128i*) &LOCAL[i][j], tmp ); } #endif #ifdef SSE_MERGE_UNROLL // manually unroll this loop since gcc won't do it; ugh const char* latestP = LATEST[i]; const char* refP = REF[i]; char* localP = LOCAL[i]; 
for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) { __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) ); __m128i ref = _mm_load_si128( (__m128i*) (refP+j) ); __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } j += sizeof(__m128i); latest = _mm_load_si128( (__m128i*) (latestP+j) ); ref = _mm_load_si128( (__m128i*) (refP+j) ); latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } j += sizeof(__m128i); latest = _mm_load_si128( (__m128i*) (latestP+j) ); ref = _mm_load_si128( (__m128i*) (refP+j) ); latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ __m128i local = _mm_load_si128( (__m128i*) (localP+j) ); __m128i localEqRef = _mm_cmpeq_epi8(local, ref); if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref _mm_stream_si128( (__m128i*) (localP+j), latest ); } else { // (~latref) & localref, bytes where lat!=ref && local==ref __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef ); // new = (latestMask & latest) | (~latestMask & local); __m128i latestBytes = _mm_and_si128(latestMask, latest); __m128i localBytes = _mm_andnot_si128(latestMask, local); latestBytes = _mm_or_si128(latestBytes, localBytes); _mm_stream_si128( (__m128i*) (localP+j), latestBytes ); } } } #endif } }
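/* The early-out tests in the merge loops above use _mm_testc_si128, which is
 * an SSE4.1 instruction. A sketch of an SSE2-only equivalent for the
 * "all bytes of the comparison result are set" check (an alternative under
 * that assumption, not part of the original merge code): */
#include <emmintrin.h>

/* Nonzero when every byte of x is 0xFF, i.e. the same condition as
 * _mm_testc_si128(x, all_ones). */
static inline int all_bytes_set_sse2(__m128i x) {
    return _mm_movemask_epi8(x) == 0xFFFF;
}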
/* ================== ================== */ void pixel_shader( const unsigned __int32 i_buffer, const unsigned __int32 coverage_mask, const __m128i bazza[3][4], shader_input_& shader_input ) { static const __m128 zero = set_zero(); static const __m128 half = set_all(0.5f); static const __m128 one = set_all(1.0f); static const __m128 two = one + one; static const __m128 three = two + one; static const __m128i zero_int = set_zero_si128(); static const __m128 colour_clamp = broadcast(load_s(255.0f)); unsigned __int32 depth_mask = 0x0; __m128 w_screen[2][4]; w_screen[0][0] = convert_float(bazza[0][0]) * shader_input.r_area; w_screen[0][1] = convert_float(bazza[0][1]) * shader_input.r_area; w_screen[0][2] = convert_float(bazza[0][2]) * shader_input.r_area; w_screen[0][3] = convert_float(bazza[0][3]) * shader_input.r_area; w_screen[1][0] = convert_float(bazza[1][0]) * shader_input.r_area; w_screen[1][1] = convert_float(bazza[1][1]) * shader_input.r_area; w_screen[1][2] = convert_float(bazza[1][2]) * shader_input.r_area; w_screen[1][3] = convert_float(bazza[1][3]) * shader_input.r_area; __m128 z_screen[4]; z_screen[0] = (shader_input.z_delta[X] * w_screen[0][0]) + (shader_input.z_delta[Y] * w_screen[1][0]) + shader_input.z_delta[Z]; z_screen[1] = (shader_input.z_delta[X] * w_screen[0][1]) + (shader_input.z_delta[Y] * w_screen[1][1]) + shader_input.z_delta[Z]; z_screen[2] = (shader_input.z_delta[X] * w_screen[0][2]) + (shader_input.z_delta[Y] * w_screen[1][2]) + shader_input.z_delta[Z]; z_screen[3] = (shader_input.z_delta[X] * w_screen[0][3]) + (shader_input.z_delta[Y] * w_screen[1][3]) + shader_input.z_delta[Z]; { //if (shader_input.is_test) { // __m128 x = convert_float(set_all(shader_input.x)); // __m128 y = convert_float(set_all(shader_input.y)); // y += set_all(0.5f); // x += set_all(0.5f); // x += set(0.0f, 1.0f, 2.0f, 3.0f); // __m128 y_block[4]; // y_block[0] = y; // y_block[1] = y + one; // y_block[2] = y + two; // y_block[3] = y + three; // __m128 z_interpolant[3]; // z_interpolant[X] = set_all(shader_input.depth_interpolants[X]); // z_interpolant[Y] = set_all(shader_input.depth_interpolants[Y]); // z_interpolant[Z] = set_all(shader_input.depth_interpolants[Z]); // z_screen[0] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[0]) + z_interpolant[Z]; // z_screen[1] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[1]) + z_interpolant[Z]; // z_screen[2] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[2]) + z_interpolant[Z]; // z_screen[3] = (z_interpolant[X] * x) + (z_interpolant[Y] * y_block[3]) + z_interpolant[Z]; //} } __m128i pixel_mask[4]; pixel_mask[0] = load_mask[(coverage_mask >> 0) & 0xf]; pixel_mask[1] = load_mask[(coverage_mask >> 4) & 0xf]; pixel_mask[2] = load_mask[(coverage_mask >> 8) & 0xf]; pixel_mask[3] = load_mask[(coverage_mask >> 12) & 0xf]; __m128 z_buffer[4]; z_buffer[0] = load(shader_input.depth_buffer + i_buffer + 0); z_buffer[1] = load(shader_input.depth_buffer + i_buffer + 4); z_buffer[2] = load(shader_input.depth_buffer + i_buffer + 8); z_buffer[3] = load(shader_input.depth_buffer + i_buffer + 12); __m128i z_mask[4]; z_mask[0] = (z_screen[0] > z_buffer[0]) & pixel_mask[0]; z_mask[1] = (z_screen[1] > z_buffer[1]) & pixel_mask[1]; z_mask[2] = (z_screen[2] > z_buffer[2]) & pixel_mask[2]; z_mask[3] = (z_screen[3] > z_buffer[3]) & pixel_mask[3]; depth_mask |= store_mask(z_mask[0]) << 0; depth_mask |= store_mask(z_mask[1]) << 4; depth_mask |= store_mask(z_mask[2]) << 8; depth_mask |= store_mask(z_mask[3]) << 12; __m128 z_write[4]; z_write[0] = 
blend(z_screen[0], z_buffer[0], z_mask[0]); z_write[1] = blend(z_screen[1], z_buffer[1], z_mask[1]); z_write[2] = blend(z_screen[2], z_buffer[2], z_mask[2]); z_write[3] = blend(z_screen[3], z_buffer[3], z_mask[3]); { __m128 z_max; z_max = z_write[0]; z_max = min_vec(z_write[1], z_max); z_max = min_vec(z_write[2], z_max); z_max = min_vec(z_write[3], z_max); __m128 z_out = z_max; z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); z_max = rotate_left(z_max); z_out = min_vec(z_max, z_out); shader_input.z_max = store_s(z_out); } store(z_write[0], shader_input.depth_buffer + i_buffer + 0); store(z_write[1], shader_input.depth_buffer + i_buffer + 4); store(z_write[2], shader_input.depth_buffer + i_buffer + 8); store(z_write[3], shader_input.depth_buffer + i_buffer + 12); if (depth_mask == 0x0) { return; } __m128 screen_barry[2][4]; screen_barry[0][0] = (w_screen[0][0] * shader_input.barycentric[0][X]) + (w_screen[1][0] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[0][1] = (w_screen[0][1] * shader_input.barycentric[0][X]) + (w_screen[1][1] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[0][2] = (w_screen[0][2] * shader_input.barycentric[0][X]) + (w_screen[1][2] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[0][3] = (w_screen[0][3] * shader_input.barycentric[0][X]) + (w_screen[1][3] * shader_input.barycentric[0][Y]) + shader_input.barycentric[0][Z]; screen_barry[1][0] = (w_screen[0][0] * shader_input.barycentric[1][X]) + (w_screen[1][0] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; screen_barry[1][1] = (w_screen[0][1] * shader_input.barycentric[1][X]) + (w_screen[1][1] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; screen_barry[1][2] = (w_screen[0][2] * shader_input.barycentric[1][X]) + (w_screen[1][2] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; screen_barry[1][3] = (w_screen[0][3] * shader_input.barycentric[1][X]) + (w_screen[1][3] * shader_input.barycentric[1][Y]) + shader_input.barycentric[1][Z]; __m128 r_depth[4]; r_depth[0] = reciprocal(z_screen[0]); r_depth[1] = reciprocal(z_screen[1]); r_depth[2] = reciprocal(z_screen[2]); r_depth[3] = reciprocal(z_screen[3]); __m128 w_clip[2][4]; w_clip[0][0] = screen_barry[0][0] * r_depth[0]; w_clip[0][1] = screen_barry[0][1] * r_depth[1]; w_clip[0][2] = screen_barry[0][2] * r_depth[2]; w_clip[0][3] = screen_barry[0][3] * r_depth[3]; w_clip[1][0] = screen_barry[1][0] * r_depth[0]; w_clip[1][1] = screen_barry[1][1] * r_depth[1]; w_clip[1][2] = screen_barry[1][2] * r_depth[2]; w_clip[1][3] = screen_barry[1][3] * r_depth[3]; __m128i colour_out[4]; { const vertex4_* gradients = shader_input.gradients[ATTRIBUTE_COLOUR]; __m128 red_float[4]; red_float[0] = (gradients[R].x * w_clip[0][0]) + (gradients[R].y * w_clip[1][0]) + gradients[R].z; red_float[1] = (gradients[R].x * w_clip[0][1]) + (gradients[R].y * w_clip[1][1]) + gradients[R].z; red_float[2] = (gradients[R].x * w_clip[0][2]) + (gradients[R].y * w_clip[1][2]) + gradients[R].z; red_float[3] = (gradients[R].x * w_clip[0][3]) + (gradients[R].y * w_clip[1][3]) + gradients[R].z; __m128 green_float[4]; green_float[0] = (gradients[G].x * w_clip[0][0]) + (gradients[G].y * w_clip[1][0]) + gradients[G].z; green_float[1] = (gradients[G].x * w_clip[0][1]) + (gradients[G].y * w_clip[1][1]) + gradients[G].z; green_float[2] = (gradients[G].x * w_clip[0][2]) + (gradients[G].y * 
w_clip[1][2]) + gradients[G].z; green_float[3] = (gradients[G].x * w_clip[0][3]) + (gradients[G].y * w_clip[1][3]) + gradients[G].z; __m128 blue_float[4]; blue_float[0] = (gradients[B].x * w_clip[0][0]) + (gradients[B].y * w_clip[1][0]) + gradients[B].z; blue_float[1] = (gradients[B].x * w_clip[0][1]) + (gradients[B].y * w_clip[1][1]) + gradients[B].z; blue_float[2] = (gradients[B].x * w_clip[0][2]) + (gradients[B].y * w_clip[1][2]) + gradients[B].z; blue_float[3] = (gradients[B].x * w_clip[0][3]) + (gradients[B].y * w_clip[1][3]) + gradients[B].z; red_float[0] = min_vec(max_vec(red_float[0], zero), colour_clamp); red_float[1] = min_vec(max_vec(red_float[1], zero), colour_clamp); red_float[2] = min_vec(max_vec(red_float[2], zero), colour_clamp); red_float[3] = min_vec(max_vec(red_float[3], zero), colour_clamp); green_float[0] = min_vec(max_vec(green_float[0], zero), colour_clamp); green_float[1] = min_vec(max_vec(green_float[1], zero), colour_clamp); green_float[2] = min_vec(max_vec(green_float[2], zero), colour_clamp); green_float[3] = min_vec(max_vec(green_float[3], zero), colour_clamp); blue_float[0] = min_vec(max_vec(blue_float[0], zero), colour_clamp); blue_float[1] = min_vec(max_vec(blue_float[1], zero), colour_clamp); blue_float[2] = min_vec(max_vec(blue_float[2], zero), colour_clamp); blue_float[3] = min_vec(max_vec(blue_float[3], zero), colour_clamp); __m128i red_int[4]; red_int[0] = convert_int_trunc(red_float[0]); red_int[1] = convert_int_trunc(red_float[1]); red_int[2] = convert_int_trunc(red_float[2]); red_int[3] = convert_int_trunc(red_float[3]); __m128i green_int[4]; green_int[0] = convert_int_trunc(green_float[0]); green_int[1] = convert_int_trunc(green_float[1]); green_int[2] = convert_int_trunc(green_float[2]); green_int[3] = convert_int_trunc(green_float[3]); __m128i blue_int[4]; blue_int[0] = convert_int_trunc(blue_float[0]); blue_int[1] = convert_int_trunc(blue_float[1]); blue_int[2] = convert_int_trunc(blue_float[2]); blue_int[3] = convert_int_trunc(blue_float[3]); colour_out[0] = red_int[0] | (green_int[0] << 8) | (blue_int[0] << 16); colour_out[1] = red_int[1] | (green_int[1] << 8) | (blue_int[1] << 16); colour_out[2] = red_int[2] | (green_int[2] << 8) | (blue_int[2] << 16); colour_out[3] = red_int[3] | (green_int[3] << 8) | (blue_int[3] << 16); } float4_ u_table[4]; float4_ v_table[4]; { const vertex4_* gradients = shader_input.gradients[ATTRIBUTE_TEXCOORD]; __m128 u_axis[4]; u_axis[0] = (gradients[U].x * w_clip[0][0]) + (gradients[U].y * w_clip[1][0]) + gradients[U].z; u_axis[1] = (gradients[U].x * w_clip[0][1]) + (gradients[U].y * w_clip[1][1]) + gradients[U].z; u_axis[2] = (gradients[U].x * w_clip[0][2]) + (gradients[U].y * w_clip[1][2]) + gradients[U].z; u_axis[3] = (gradients[U].x * w_clip[0][3]) + (gradients[U].y * w_clip[1][3]) + gradients[U].z; __m128 v_axis[4]; v_axis[0] = (gradients[V].x * w_clip[0][0]) + (gradients[V].y * w_clip[1][0]) + gradients[V].z; v_axis[1] = (gradients[V].x * w_clip[0][1]) + (gradients[V].y * w_clip[1][1]) + gradients[V].z; v_axis[2] = (gradients[V].x * w_clip[0][2]) + (gradients[V].y * w_clip[1][2]) + gradients[V].z; v_axis[3] = (gradients[V].x * w_clip[0][3]) + (gradients[V].y * w_clip[1][3]) + gradients[V].z; store_u(u_axis[0], u_table[0].f); store_u(u_axis[1], u_table[1].f); store_u(u_axis[2], u_table[2].f); store_u(u_axis[3], u_table[3].f); store_u(v_axis[0], v_table[0].f); store_u(v_axis[1], v_table[1].f); store_u(v_axis[2], v_table[2].f); store_u(v_axis[3], v_table[3].f); } const texture_handler_& texture_handler = 
*shader_input.texture_handler; float2_ du; du.x = (u_table[0].f[3] - u_table[0].f[0]) * (float)texture_handler.width; du.y = (u_table[3].f[0] - u_table[0].f[0]) * (float)texture_handler.width; float2_ dv; dv.x = (v_table[0].f[3] - v_table[0].f[0]) * (float)texture_handler.height; dv.y = (v_table[3].f[0] - v_table[0].f[0]) * (float)texture_handler.height; float area = abs((du.x * dv.y) - (du.y * dv.x)) * shader_input.mip_level_bias; unsigned long area_int = 1 + (unsigned long)(area + 0.5f); __int32 i_mip_floor; _BitScanReverse((unsigned long*)&i_mip_floor, area_int); i_mip_floor = max(i_mip_floor, 0); i_mip_floor = min(i_mip_floor, texture_handler.n_mip_levels - 1); const __int32 width = texture_handler.width >> i_mip_floor; const __int32 height = texture_handler.height >> i_mip_floor; const __int32 shift = texture_handler.width_shift - i_mip_floor; const __m128i texture_width_int = set_all(width); const __m128 texture_width = convert_float(set_all(width)); const __m128 texture_height = convert_float(set_all(height)); const __m128i width_clamp = set_all(width - 1); const __m128i height_clamp = set_all(height - 1); const __m128i width_shift = load_s(shift); __m128i tex_out[4]; { __m128 u_axis[4]; u_axis[0] = (load_u(u_table[0].f) * texture_width); // - half; u_axis[1] = (load_u(u_table[1].f) * texture_width); // - half; u_axis[2] = (load_u(u_table[2].f) * texture_width); // - half; u_axis[3] = (load_u(u_table[3].f) * texture_width); // - half; __m128 v_axis[4]; v_axis[0] = (load_u(v_table[0].f) * texture_height); // - half; v_axis[1] = (load_u(v_table[1].f) * texture_height); // - half; v_axis[2] = (load_u(v_table[2].f) * texture_height); // - half; v_axis[3] = (load_u(v_table[3].f) * texture_height); // - half; __m128i u_int[4]; u_int[0] = convert_int_trunc(u_axis[0]); u_int[1] = convert_int_trunc(u_axis[1]); u_int[2] = convert_int_trunc(u_axis[2]); u_int[3] = convert_int_trunc(u_axis[3]); __m128i v_int[4]; v_int[0] = convert_int_trunc(v_axis[0]); v_int[1] = convert_int_trunc(v_axis[1]); v_int[2] = convert_int_trunc(v_axis[2]); v_int[3] = convert_int_trunc(v_axis[3]); u_int[0] = max_vec(min_vec(u_int[0], width_clamp), zero_int); u_int[1] = max_vec(min_vec(u_int[1], width_clamp), zero_int); u_int[2] = max_vec(min_vec(u_int[2], width_clamp), zero_int); u_int[3] = max_vec(min_vec(u_int[3], width_clamp), zero_int); v_int[0] = max_vec(min_vec(v_int[0], height_clamp), zero_int); v_int[1] = max_vec(min_vec(v_int[1], height_clamp), zero_int); v_int[2] = max_vec(min_vec(v_int[2], height_clamp), zero_int); v_int[3] = max_vec(min_vec(v_int[3], height_clamp), zero_int); __m128i i_texels[4]; i_texels[0] = u_int[0] + (v_int[0] * texture_width_int); i_texels[1] = u_int[1] + (v_int[1] * texture_width_int); i_texels[2] = u_int[2] + (v_int[2] * texture_width_int); i_texels[3] = u_int[3] + (v_int[3] * texture_width_int); __int32 i_texels_in[4][4]; store_u(i_texels[0], i_texels_in[0]); store_u(i_texels[1], i_texels_in[1]); store_u(i_texels[2], i_texels_in[2]); store_u(i_texels[3], i_texels_in[3]); unsigned __int32 texels_out[4][4]; texels_out[0][0] = texture_handler.texture[i_mip_floor][i_texels_in[0][0]]; texels_out[0][1] = texture_handler.texture[i_mip_floor][i_texels_in[0][1]]; texels_out[0][2] = texture_handler.texture[i_mip_floor][i_texels_in[0][2]]; texels_out[0][3] = texture_handler.texture[i_mip_floor][i_texels_in[0][3]]; texels_out[1][0] = texture_handler.texture[i_mip_floor][i_texels_in[1][0]]; texels_out[1][1] = texture_handler.texture[i_mip_floor][i_texels_in[1][1]]; texels_out[1][2] = 
texture_handler.texture[i_mip_floor][i_texels_in[1][2]]; texels_out[1][3] = texture_handler.texture[i_mip_floor][i_texels_in[1][3]]; texels_out[2][0] = texture_handler.texture[i_mip_floor][i_texels_in[2][0]]; texels_out[2][1] = texture_handler.texture[i_mip_floor][i_texels_in[2][1]]; texels_out[2][2] = texture_handler.texture[i_mip_floor][i_texels_in[2][2]]; texels_out[2][3] = texture_handler.texture[i_mip_floor][i_texels_in[2][3]]; texels_out[3][0] = texture_handler.texture[i_mip_floor][i_texels_in[3][0]]; texels_out[3][1] = texture_handler.texture[i_mip_floor][i_texels_in[3][1]]; texels_out[3][2] = texture_handler.texture[i_mip_floor][i_texels_in[3][2]]; texels_out[3][3] = texture_handler.texture[i_mip_floor][i_texels_in[3][3]]; tex_out[0] = load_u(texels_out[0]); tex_out[1] = load_u(texels_out[1]); tex_out[2] = load_u(texels_out[2]); tex_out[3] = load_u(texels_out[3]); } __m128i colour_buffer[4]; colour_buffer[0] = load(shader_input.colour_buffer + i_buffer + 0); colour_buffer[1] = load(shader_input.colour_buffer + i_buffer + 4); colour_buffer[2] = load(shader_input.colour_buffer + i_buffer + 8); colour_buffer[3] = load(shader_input.colour_buffer + i_buffer + 12); colour_buffer[0] = _mm_andnot_si128(z_mask[0], colour_buffer[0]); colour_buffer[1] = _mm_andnot_si128(z_mask[1], colour_buffer[1]); colour_buffer[2] = _mm_andnot_si128(z_mask[2], colour_buffer[2]); colour_buffer[3] = _mm_andnot_si128(z_mask[3], colour_buffer[3]); colour_buffer[0] = add_uint8_saturate(colour_buffer[0], colour_out[0] & z_mask[0]); colour_buffer[1] = add_uint8_saturate(colour_buffer[1], colour_out[1] & z_mask[1]); colour_buffer[2] = add_uint8_saturate(colour_buffer[2], colour_out[2] & z_mask[2]); colour_buffer[3] = add_uint8_saturate(colour_buffer[3], colour_out[3] & z_mask[3]); colour_buffer[0] = add_uint8_saturate(colour_buffer[0], tex_out[0] & z_mask[0]); colour_buffer[1] = add_uint8_saturate(colour_buffer[1], tex_out[1] & z_mask[1]); colour_buffer[2] = add_uint8_saturate(colour_buffer[2], tex_out[2] & z_mask[2]); colour_buffer[3] = add_uint8_saturate(colour_buffer[3], tex_out[3] & z_mask[3]); store(colour_buffer[0], shader_input.colour_buffer + i_buffer + 0); store(colour_buffer[1], shader_input.colour_buffer + i_buffer + 4); store(colour_buffer[2], shader_input.colour_buffer + i_buffer + 8); store(colour_buffer[3], shader_input.colour_buffer + i_buffer + 12); }
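/* The z_max block in the shader above reduces four depth values to a scalar
 * using the min_vec and rotate_left wrappers. With raw SSE intrinsics the same
 * horizontal minimum can be sketched as follows (illustrative, not part of the
 * original renderer): */
#include <xmmintrin.h>

static inline float horizontal_min_ps(__m128 v) {
    __m128 t = _mm_min_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1))); /* pairwise min   */
    t = _mm_min_ps(t, _mm_shuffle_ps(t, t, _MM_SHUFFLE(1, 0, 3, 2)));        /* cross-pair min */
    return _mm_cvtss_f32(t);
}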
void sincos_ps(__m128 x, __m128 *s, __m128 *c) { __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; __m128i emm0, emm2, emm4; sign_bit_sin = x; x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask)); sign_bit_sin = _mm_and_ps(sign_bit_sin, *reinterpret_cast<const __m128*>(_pi_sign_mask)); y = _mm_mul_ps(x, *_ps_cephes_FOPI); emm2 = _mm_cvttps_epi32(y); emm2 = _mm_add_epi32(emm2, *_pi_1); emm2 = _mm_and_si128(emm2, *_pi_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; emm0 = _mm_and_si128(emm2, *_pi_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); emm2 = _mm_and_si128(emm2, *_pi_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); xmm1 = *_ps_minus_cephes_DP1; xmm2 = *_ps_minus_cephes_DP2; xmm3 = *_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); x = _mm_add_ps(x, xmm1); x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); emm4 = _mm_sub_epi32(emm4, *_pi_2); emm4 = _mm_andnot_si128(emm4, *_pi_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); __m128 z = _mm_mul_ps(x, x); y = *_ps_coscof_p0; y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p1); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, *_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); __m128 tmp = _mm_mul_ps(z, *_ps_0p5); y = _mm_sub_ps(y, tmp); y = _mm_add_ps(y, *_ps_1); __m128 y2 = *_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, *_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); xmm1 = _mm_add_ps(ysin1, ysin2); xmm2 = _mm_add_ps(y, y2); *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); }
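/* Hypothetical usage of sincos_ps (not part of the original sse_mathfun-style
 * code; it assumes the constant tables such as _ps_cephes_FOPI referenced
 * above are defined): evaluate sine and cosine of four angles at once. */
#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
    __m128 angles = _mm_setr_ps(0.0f, 0.5f, 1.0f, 1.5f);
    __m128 s, c;
    sincos_ps(angles, &s, &c);
    float sv[4], cv[4];
    _mm_storeu_ps(sv, s);
    _mm_storeu_ps(cv, c);
    for (int i = 0; i < 4; i++)
        printf("sin=%f cos=%f\n", sv[i], cv[i]);
    return 0;
}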
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds) { const __m128i sb2u = _mm_set_epi32( 0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400); const __m128i sb2t = _mm_set_epi32( 0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900); const __m128i sbou = _mm_set_epi32( 0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700); const __m128i sbot = _mm_set_epi32( 0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00); const __m128i mc_backward[4] = { _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003), _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F), _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B), _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407), }; B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)), _mm_shuffle_epi8(k_ipt2, _mm_srli_epi32( _mm_andnot_si128(low_nibs, B), 4)), _mm_loadu_si128(keys)); for(size_t r = 1; ; ++r) { const __m128i K = _mm_loadu_si128(keys + r); __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4); B = _mm_and_si128(low_nibs, B); __m128i t2 = _mm_shuffle_epi8(k_inv2, B); B = _mm_xor_si128(B, t); __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t)); __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B)); __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3)); __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4)); if(r == rounds) { B = _mm_shuffle_epi8( mm_xor3(_mm_shuffle_epi8(sbou, t5), _mm_shuffle_epi8(sbot, t6), K), sr[r % 4]); return B; } __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6), _mm_shuffle_epi8(sb1u, t5), K); __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6), _mm_shuffle_epi8(sb2u, t5), _mm_shuffle_epi8(t7, mc_forward[r % 4])); B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]), _mm_shuffle_epi8(t7, mc_backward[r % 4]), t8); } }
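/* Both the encrypt round above and the decrypt round that follows begin by
 * splitting each byte of the state into low and high nibbles before the table
 * lookups. In isolation (assuming low_nibs is _mm_set1_epi8(0x0F), as is usual
 * in the vector-permute AES construction) that step is the sketch below: */
#include <emmintrin.h>

static inline void split_nibbles(__m128i x, __m128i *lo, __m128i *hi) {
    const __m128i low_nibs = _mm_set1_epi8(0x0F);
    *lo = _mm_and_si128(low_nibs, x);                        /* x & 0x0F        */
    *hi = _mm_srli_epi32(_mm_andnot_si128(low_nibs, x), 4);  /* (x & 0xF0) >> 4 */
}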
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds) { const __m128i k_dipt1 = _mm_set_epi32( 0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00); const __m128i k_dipt2 = _mm_set_epi32( 0x12771772, 0xF491F194, 0x86E383E6, 0x60056500); const __m128i sb9u = _mm_set_epi32( 0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600); const __m128i sb9t = _mm_set_epi32( 0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900); const __m128i sbeu = _mm_set_epi32( 0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000); const __m128i sbet = _mm_set_epi32( 0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100); const __m128i sbdu = _mm_set_epi32( 0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200); const __m128i sbdt = _mm_set_epi32( 0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00); const __m128i sbbu = _mm_set_epi32( 0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200); const __m128i sbbt = _mm_set_epi32( 0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700); __m128i mc = mc_forward[3]; __m128i t = _mm_shuffle_epi8(k_dipt2, _mm_srli_epi32( _mm_andnot_si128(low_nibs, B), 4)); B = mm_xor3(t, _mm_loadu_si128(keys), _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs))); for(size_t r = 1; ; ++r) { const __m128i K = _mm_loadu_si128(keys + r); t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4); B = _mm_and_si128(low_nibs, B); __m128i t2 = _mm_shuffle_epi8(k_inv2, B); B = _mm_xor_si128(B, t); __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t)); __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B)); __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3)); __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4)); if(r == rounds) { const __m128i sbou = _mm_set_epi32( 0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000); const __m128i sbot = _mm_set_epi32( 0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00); __m128i x = _mm_shuffle_epi8(sbou, t5); __m128i y = _mm_shuffle_epi8(sbot, t6); x = _mm_xor_si128(x, K); x = _mm_xor_si128(x, y); const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16; return _mm_shuffle_epi8(x, sr[which_sr]); } __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6), _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K)); __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc), _mm_shuffle_epi8(sbdu, t5), _mm_shuffle_epi8(sbdt, t6)); __m128i t12 = _mm_xor_si128( _mm_xor_si128( _mm_shuffle_epi8(t9, mc), _mm_shuffle_epi8(sbbu, t5)), _mm_shuffle_epi8(sbbt, t6)); B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc), _mm_shuffle_epi8(sbeu, t5)), _mm_shuffle_epi8(sbet, t6)); mc = _mm_alignr_epi8(mc, mc, 12); } }
// this function performs precise calculations void PreOver_SSE2(void* dest, const void* source1, const void* source2, size_t size) { static const size_t stride = sizeof(__m128i)*4; static const u32 PSD = 64; static const __m128i round = _mm_set1_epi16(128); static const __m128i lomask = _mm_set1_epi32(0x00FF00FF); assert(source1 != NULL && source2 != NULL && dest != NULL); assert(size % stride == 0); const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1); const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2); __m128i* dest128 = reinterpret_cast<__m128i*>(dest); __m128i d, s, a, rb, ag, t; // TODO: dynamic prefetch schedluing distance? needs to be optimized (R.N) for(size_t k = 0, length = size/stride; k < length; ++k) { // TODO: put prefetch between calculations?(R.N) _mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA); _mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA); // work on entire cacheline before next prefetch for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2) { // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/ // TODO: load entire cacheline at the same time? are there enough registers? 32 bit mode (special compile for 64bit?) (R.N) s = _mm_load_si128(source128_1); // AABGGRR d = _mm_load_si128(source128_2); // AABGGRR // PRELERP(S, D) = S+D - ((S*D[A]+0x80)>>8)+(S*D[A]+0x80))>>8 // T = S*D[A]+0x80 => PRELERP(S,D) = S+D - ((T>>8)+T)>>8 // set alpha to lo16 from dest_ a = _mm_srli_epi32(d, 24); // 000000AA rb = _mm_slli_epi32(a, 16); // 00AA0000 a = _mm_or_si128(rb, a); // 00AA00AA rb = _mm_and_si128(lomask, s); // 00BB00RR rb = _mm_mullo_epi16(rb, a); // BBBBRRRR rb = _mm_add_epi16(rb, round); // BBBBRRRR t = _mm_srli_epi16(rb, 8); // 00BB00RR t = _mm_add_epi16(t, rb); rb = _mm_srli_epi16(t, 8); ag = _mm_srli_epi16(s, 8); // 00AA00GG ag = _mm_mullo_epi16(ag, a); // AAAAGGGG ag = _mm_add_epi16(ag, round); t = _mm_srli_epi16(ag, 8); t = _mm_add_epi16(t, ag); ag = _mm_andnot_si128(lomask, t); // AA00GG00 rb = _mm_or_si128(rb, ag); // AABGGRR pack rb = _mm_sub_epi8(s, rb); // sub S-[(D[A]*S)/255] d = _mm_add_epi8(d, rb); // add D+[S-(D[A]*S)/255] _mm_store_si128(dest128, d); } } }
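/* Scalar reference for one channel of the PRELERP computed above (an
 * illustrative sketch, not the original routine): result = S + D - div255(S * Da),
 * using the same +0x80 / add-high-byte rounding as the SSE2 code. */
#include <stdint.h>

static inline uint8_t preover_channel(uint8_t s, uint8_t d, uint8_t da) {
    uint32_t t = (uint32_t)s * da + 0x80;
    uint32_t s_da = (t + (t >> 8)) >> 8;   /* (s * da) / 255 with rounding */
    return (uint8_t)(s + d - s_da);        /* S + D - S*Da/255             */
}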
static FORCE_INLINE void warp_mmword_u8_sse2(const uint8_t *srcp, const uint8_t *edgep, uint8_t *dstp, int src_stride, int edge_stride, int height, int x, int y, const __m128i &depth, const __m128i &zero, const __m128i &x_limit_min, const __m128i &x_limit_max, const __m128i &y_limit_min, const __m128i &y_limit_max, const __m128i &word_64, const __m128i &word_127, const __m128i &word_128, const __m128i &word_255, const __m128i &one_stride) { int SMAG = 1 << SMAGL; // calculate displacement __m128i above = _mm_loadl_epi64((const __m128i *)(edgep + x - (y ? edge_stride : 0))); __m128i below = _mm_loadl_epi64((const __m128i *)(edgep + x + (y < height - 1 ? edge_stride : 0))); __m128i left = _mm_loadl_epi64((const __m128i *)(edgep + x - 1)); __m128i right = _mm_loadl_epi64((const __m128i *)(edgep + x + 1)); above = _mm_unpacklo_epi8(above, zero); below = _mm_unpacklo_epi8(below, zero); left = _mm_unpacklo_epi8(left, zero); right = _mm_unpacklo_epi8(right, zero); __m128i h = _mm_sub_epi16(left, right); __m128i v = _mm_sub_epi16(above, below); h = _mm_slli_epi16(h, 7); v = _mm_slli_epi16(v, 7); h = _mm_mulhi_epi16(h, depth); v = _mm_mulhi_epi16(v, depth); v = _mm_max_epi16(v, y_limit_min); v = _mm_min_epi16(v, y_limit_max); __m128i remainder_h = h; __m128i remainder_v = v; if (SMAGL) { remainder_h = _mm_slli_epi16(remainder_h, SMAGL); remainder_v = _mm_slli_epi16(remainder_v, SMAGL); } remainder_h = _mm_and_si128(remainder_h, word_127); remainder_v = _mm_and_si128(remainder_v, word_127); h = _mm_srai_epi16(h, 7 - SMAGL); v = _mm_srai_epi16(v, 7 - SMAGL); __m128i xx = _mm_set1_epi32(x << SMAGL); xx = _mm_packs_epi32(xx, xx); h = _mm_adds_epi16(h, xx); remainder_h = _mm_and_si128(remainder_h, _mm_cmpgt_epi16(x_limit_max, h)); remainder_h = _mm_andnot_si128(_mm_cmpgt_epi16(x_limit_min, h), remainder_h); h = _mm_max_epi16(h, x_limit_min); h = _mm_min_epi16(h, x_limit_max); // h and v contain the displacement now. 
__m128i disp_lo = _mm_unpacklo_epi16(v, h); __m128i disp_hi = _mm_unpackhi_epi16(v, h); disp_lo = _mm_madd_epi16(disp_lo, one_stride); disp_hi = _mm_madd_epi16(disp_hi, one_stride); __m128i line0 = _mm_setzero_si128(); __m128i line1 = _mm_setzero_si128(); int offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset), 0); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride), 0); offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 1 * SMAG), 1); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 1 * SMAG), 1); offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 2 * SMAG), 2); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 2 * SMAG), 2); offset = _mm_cvtsi128_si32(disp_lo); disp_lo = _mm_srli_si128(disp_lo, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 3 * SMAG), 3); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 3 * SMAG), 3); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 4 * SMAG), 4); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 4 * SMAG), 4); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 5 * SMAG), 5); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 5 * SMAG), 5); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 6 * SMAG), 6); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 6 * SMAG), 6); offset = _mm_cvtsi128_si32(disp_hi); disp_hi = _mm_srli_si128(disp_hi, 4); line0 = _mm_insert_epi16(line0, *(int16_t *)(srcp + offset + 7 * SMAG), 7); line1 = _mm_insert_epi16(line1, *(int16_t *)(srcp + offset + src_stride + 7 * SMAG), 7); __m128i left0 = _mm_and_si128(line0, word_255); __m128i left1 = _mm_and_si128(line1, word_255); __m128i right0 = _mm_srli_epi16(line0, 8); __m128i right1 = _mm_srli_epi16(line1, 8); left0 = _mm_mullo_epi16(left0, _mm_sub_epi16(word_128, remainder_h)); left1 = _mm_mullo_epi16(left1, _mm_sub_epi16(word_128, remainder_h)); right0 = _mm_mullo_epi16(right0, remainder_h); right1 = _mm_mullo_epi16(right1, remainder_h); line0 = _mm_add_epi16(left0, right0); line1 = _mm_add_epi16(left1, right1); line0 = _mm_add_epi16(line0, word_64); line1 = _mm_add_epi16(line1, word_64); line0 = _mm_srai_epi16(line0, 7); line1 = _mm_srai_epi16(line1, 7); line0 = _mm_mullo_epi16(line0, _mm_sub_epi16(word_128, remainder_v)); line1 = _mm_mullo_epi16(line1, remainder_v); __m128i result = _mm_add_epi16(line0, line1); result = _mm_add_epi16(result, word_64); result = _mm_srai_epi16(result, 7); result = _mm_packus_epi16(result, result); _mm_storel_epi64((__m128i *)(dstp + x), result); }
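/* The tail of warp_mmword_u8_sse2 is a bilinear filter with 7-bit fractional
 * weights: blend left/right texels with remainder_h, then the two resulting
 * rows with remainder_v, rounding with +64 before each >> 7. A scalar
 * reference of the same arithmetic (illustrative sketch): */
#include <stdint.h>

static inline uint8_t bilinear_7bit(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                                    int rem_h, int rem_v) {   /* remainders in [0, 127] */
    int top    = (tl * (128 - rem_h) + tr * rem_h + 64) >> 7;
    int bottom = (bl * (128 - rem_h) + br * rem_h + 64) >> 7;
    return (uint8_t)((top * (128 - rem_v) + bottom * rem_v + 64) >> 7);
}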
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, edge_t *eh, uint16_t plane_max) { uint8_t* p0 = buff + 16; uint8_t* p1 = p0 + bstride; uint8_t* p2 = p1 + bstride; uint8_t* p3 = p2 + bstride; uint8_t* p4 = p3 + bstride; uint8_t* orig = p0; uint8_t* end = p4; line_copy8(p0, srcp + 2 * stride, width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min; uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max; __m128i zero = _mm_setzero_si128(); __m128i ab = _mm_set1_epi16(15); __m128i max = _mm_set1_epi8((int8_t)th_max); __m128i min = _mm_set1_epi8((int8_t)th_min); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2}; uint8_t* posv[] = {p0, p1, p3, p4}; for (int x = 0; x < width; x += 16) { __m128i sumx[2] = {zero, zero}; __m128i sumy[2] = {zero, zero}; for (int i = 0; i < 4; i++) { __m128i xmm0, xmm1, xmul; xmul = _mm_load_si128((__m128i *)ar_mulx[i]); xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x)); xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul)); sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul)); xmul = _mm_load_si128((__m128i *)ar_muly[i]); xmm0 = _mm_load_si128((__m128i *)(posv[i] + x)); xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul)); sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul)); } for (int i = 0; i < 2; i++) { __m128i xmax, xmin, mull, mulh; sumx[i] = mm_abs_epi16(sumx[i]); sumy[i] = mm_abs_epi16(sumy[i]); xmax = _mm_max_epi16(sumx[i], sumy[i]); xmin = _mm_min_epi16(sumx[i], sumy[i]); mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4); mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4); xmax = mm_cast_epi32(mull, mulh); mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5); mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5); xmin = mm_cast_epi32(mull, mulh); sumx[i] = _mm_adds_epu16(xmax, xmin); sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift); } __m128i out = _mm_packus_epi16(sumx[0], sumx[1]); __m128i temp = _mm_min_epu8(out, max); temp = _mm_cmpeq_epi8(temp, max); out = _mm_or_si128(temp, out); temp = _mm_max_epu8(out, min); temp = _mm_cmpeq_epi8(temp, min); out = _mm_andnot_si128(temp, out); _mm_store_si128((__m128i*)(dstp + x), out); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
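/* The min/cmpeq/or and max/cmpeq/andnot sequence at the end of the loop above
 * implements a per-byte double threshold. Its scalar equivalent is
 * (illustrative sketch, not part of the original filter): */
#include <stdint.h>

static inline uint8_t edge_threshold(uint8_t v, uint8_t th_min, uint8_t th_max) {
    if (v >= th_max) return 255;  /* _mm_min_epu8 / _mm_cmpeq_epi8 / _mm_or_si128     */
    if (v <= th_min) return 0;    /* _mm_max_epu8 / _mm_cmpeq_epi8 / _mm_andnot_si128 */
    return v;
}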
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t* p0 = (uint16_t *)buff + 8; uint16_t* p1 = p0 + bstride; uint16_t* p2 = p1 + bstride; uint16_t* p3 = p2 + bstride; uint16_t* p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 alpha = _mm_set1_ps((float)0.96043387); __m128 beta = _mm_set1_ps((float)0.39782473); __m128i pmax = _mm_set1_epi32(0xFFFF); __m128i min = _mm_set1_epi16((int16_t)eh->min); __m128i max = _mm_set1_epi16((int16_t)eh->max); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2}; uint16_t* posv[] = {p0, p1, p3, p4}; for (int x = 0; x < width; x += 8) { __m128 sumx[2] = {(__m128)zero, (__m128)zero}; __m128 sumy[2] = {(__m128)zero, (__m128)zero}; for (int i = 0; i < 4; i++) { __m128 xmul = _mm_load_ps(ar_mulxf[i]); __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x)); __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul)); sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul)); xmul = _mm_load_ps(ar_mulyf[i]); xmm0 = _mm_load_si128((__m128i *)(posv[i] + x)); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul)); sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul)); } __m128i out[2]; for (int i = 0; i < 2; i++) { sumx[i] = mm_abs_ps(sumx[i]); sumy[i] = mm_abs_ps(sumy[i]); __m128 t0 = _mm_max_ps(sumx[i], sumy[i]); __m128 t1 = _mm_min_ps(sumx[i], sumy[i]); t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1)); out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift); out[i] = mm_min_epi32(out[i], pmax); } out[0] = mm_cast_epi32(out[0], out[1]); out[1] = MM_MIN_EPU16(out[0], max); out[1] = _mm_cmpeq_epi16(out[1], max); out[0] = _mm_or_si128(out[1], out[0]); out[1] = MM_MAX_EPU16(out[0], min); out[1] = _mm_cmpeq_epi16(out[1], min); out[0] = _mm_andnot_si128(out[1], out[0]); _mm_store_si128((__m128i *)(dstp + x), out[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
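/* The constants 0.96043387 and 0.39782473 used above are the "alpha max plus
 * beta min" coefficients commonly used to approximate a vector magnitude
 * without a square root. A scalar sketch of that approximation (illustrative,
 * not part of the original filter): */
#include <math.h>

static inline float gradient_magnitude(float gx, float gy) {
    const float alpha = 0.96043387f, beta = 0.39782473f;
    float ax = fabsf(gx), ay = fabsf(gy);
    float hi = ax > ay ? ax : ay;
    float lo = ax > ay ? ay : ax;
    return alpha * hi + beta * lo;   /* ~= sqrtf(gx * gx + gy * gy) */
}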
static inline uint16_t fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union fm10k_rx_desc *rxdp; struct rte_mbuf **mbufp; uint16_t nb_pkts_recd; int pos; struct fm10k_rx_queue *rxq = rx_queue; uint64_t var; __m128i shuf_msk; __m128i dd_check, eop_check; uint16_t next_dd; next_dd = rxq->next_dd; /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->hw_ring + next_dd; rte_prefetch0(rxdp); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_FM10K_RXQ_REARM_THRESH) fm10k_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->d.staterr & FM10K_RXD_STATUS_DD)) return 0; /* Vecotr RX will process 4 packets at a time, strip the unaligned * tails in case it's not multiple of 4. */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_FM10K_DESCS_PER_LOOP); /* 4 packets DD mask */ dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL); /* 4 packets EOP mask */ eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); /* mask to shuffle from desc. to mbuf */ shuf_msk = _mm_set_epi8( 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 15, 14, /* octet 14~15, low 16 bits vlan_macip */ 13, 12, /* octet 12~13, 16 bits data_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 13, 12, /* octet 12~13, low 16 bits pkt_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_type */ 0xFF, 0xFF /* Skip pkt_type field in shuffle operation */ ); /* * Compile-time verify the shuffle mask * NOTE: some field positions already verified above, but duplicated * here for completeness in case of future modifications. */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) != offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12); /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ mbufp = &rxq->sw_ring[next_dd]; /* A. load 4 packet in one loop * [A*. mask out 4 unused dirty field in desc] * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; pos += RTE_FM10K_DESCS_PER_LOOP, rxdp += RTE_FM10K_DESCS_PER_LOOP) { __m128i descs0[RTE_FM10K_DESCS_PER_LOOP]; __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; __m128i zero, staterr, sterr_tmp1, sterr_tmp2; __m128i mbp1; /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. 
*/ #if defined(RTE_ARCH_X86_64) __m128i mbp2; #endif /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */ mbp1 = _mm_loadu_si128((__m128i *)&mbufp[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs0[3] = _mm_loadu_si128((__m128i *)(rxdp + 3)); rte_compiler_barrier(); /* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1); #if defined(RTE_ARCH_X86_64) /* B.1 load 2 64 bit mbuf poitns */ mbp2 = _mm_loadu_si128((__m128i *)&mbufp[pos+2]); #endif descs0[2] = _mm_loadu_si128((__m128i *)(rxdp + 2)); rte_compiler_barrier(); /* B.1 load 2 mbuf point */ descs0[1] = _mm_loadu_si128((__m128i *)(rxdp + 1)); rte_compiler_barrier(); descs0[0] = _mm_loadu_si128((__m128i *)(rxdp)); #if defined(RTE_ARCH_X86_64) /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2); #endif /* avoid compiler reorder optimization */ rte_compiler_barrier(); if (split_packet) { rte_mbuf_prefetch_part2(rx_pkts[pos]); rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = _mm_shuffle_epi8(descs0[3], shuf_msk); pkt_mb3 = _mm_shuffle_epi8(descs0[2], shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = _mm_unpackhi_epi32(descs0[3], descs0[2]); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = _mm_unpackhi_epi32(descs0[1], descs0[0]); /* set ol_flags with vlan packet type */ fm10k_desc_to_olflags_v(descs0, &rx_pkts[pos]); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = _mm_shuffle_epi8(descs0[1], shuf_msk); pkt_mb1 = _mm_shuffle_epi8(descs0[0], shuf_msk); /* C.2 get 4 pkts staterr value */ zero = _mm_xor_si128(dd_check, dd_check); staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2); /* D.3 copy final 3,4 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1, pkt_mb4); _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1, pkt_mb3); /* C* extract and record EOP bit */ if (split_packet) { __m128i eop_shuf_mask = _mm_set_epi8( 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x0C, 0x00, 0x08 ); /* and with mask to extract bits, flipping 1-0 */ __m128i eop_bits = _mm_andnot_si128(staterr, eop_check); /* the staterr values are not in order, as the count * count of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. 
This also * compresses the 32-bit values to 8-bit */ eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ *(int *)split_packet = _mm_cvtsi128_si32(eop_bits); split_packet += RTE_FM10K_DESCS_PER_LOOP; /* zero-out next pointers */ rx_pkts[pos]->next = NULL; rx_pkts[pos + 1]->next = NULL; rx_pkts[pos + 2]->next = NULL; rx_pkts[pos + 3]->next = NULL; } /* C.3 calc available number of desc */ staterr = _mm_and_si128(staterr, dd_check); staterr = _mm_packs_epi32(staterr, zero); /* D.3 copy final 1,2 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1, pkt_mb2); _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); fm10k_desc_to_pktype_v(descs0, &rx_pkts[pos]); /* C.4 calc avaialbe number of desc */ var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); nb_pkts_recd += var; if (likely(var != RTE_FM10K_DESCS_PER_LOOP)) break; } /* Update our internal tail pointer */ rxq->next_dd = (uint16_t)(rxq->next_dd + nb_pkts_recd); rxq->next_dd = (uint16_t)(rxq->next_dd & (rxq->nb_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line-endings not multiple by four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. * * at first we fetch four U-values from its array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because always two pixel "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixel: * aabbccdd * aabbccdd, but we can only work on four pixel at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixels in each line, but can only process four * per loop. So we need to load the upper four pixels' data from memory every second loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixels from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* now we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and that's always good. * before we had dwords of data, and by shifting left and treating the result * as packed words, we not only get signed words, but also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we still have signed words, and there are those min/max instructions in SSE2 ... * the max instruction always takes the bigger of the two operands and stores it in the first one, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same trick, just the other way around, can be used to limit our values to 255, * but now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we've got our bytes. * the moment has come to assemble the three channels R,G and B to the xrgb dwords * on the Red channel we just have to and each resulting dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on the Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on the Blue channel that one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we or it together and get this one: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* The only thing to do now is write the data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in the line. 
*/ if (lastCol & 0x02) { /* let's say, we only need to convert six pixels in width * Ok, the first 4 pixels will be converted just like any other 4 pixels, but * if it's the last loop in line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6 = _mm_load_si128(buffer+3); /* we and our output data with this mask to get only the valid pixels */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and and it with the inverse mask. We get only those pixels that should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to or the two values together and write it back to the destination array, * and only the pixels that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the second line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are iStride[0] and the target scanline. * But if we don't need to process the second line, e.g. if we are in the last line of an image * with an odd number of lines, we just skip all this. */ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift lastCol back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointers by four pixels */ pDst += 16; YData += 4; }
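The comments above describe a fixed-point conversion with BT.709-like coefficients: with D = U - 128 and E = V - 128, the code forms R = Y + 403*E/256, G = Y - (48*D + 120*E)/256 and B = Y + 475*D/256, then clips to [0, 255] and packs xrgb. The following scalar sketch is my reading of that math, not part of the original primitive; the rounding of negative products differs slightly from the packed-word extraction used above.

#include <stdint.h>

static uint8_t clip8(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One pixel of the fixed-point YUV->xrgb conversion sketched from the comments above. */
static uint32_t yuv_to_xrgb_scalar(uint8_t y, uint8_t u, uint8_t v)
{
    int d = (int)u - 128;                               /* D */
    int e = (int)v - 128;                               /* E */
    uint8_t r = clip8(y + ((403 * e) >> 8));            /* E*403 path */
    uint8_t g = clip8(y - ((48 * d + 120 * e) >> 8));   /* 48*D + 120*E path */
    uint8_t b = clip8(y + ((475 * d) >> 8));            /* D*475 path */
    return ((uint32_t)r << 16) | ((uint32_t)g << 8) | (uint32_t)b; /* xrgb */
}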
PRBool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf, const gfxImageSurface* whiteSurf) { gfxIntSize size = blackSurf->GetSize(); if (size != whiteSurf->GetSize() || (blackSurf->Format() != gfxASurface::ImageFormatARGB32 && blackSurf->Format() != gfxASurface::ImageFormatRGB24) || (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 && whiteSurf->Format() != gfxASurface::ImageFormatRGB24)) return PR_FALSE; blackSurf->Flush(); whiteSurf->Flush(); unsigned char* blackData = blackSurf->Data(); unsigned char* whiteData = whiteSurf->Data(); if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) || (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) { // Cannot keep these in alignment. return PR_FALSE; } __m128i greenMask = _mm_load_si128((__m128i*)greenMaski); __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski); for (PRInt32 i = 0; i < size.height; ++i) { PRInt32 j = 0; // Loop single pixels until at 16 byte alignment. while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) { *((PRUint32*)blackData) = RecoverPixel(*reinterpret_cast<PRUint32*>(blackData), *reinterpret_cast<PRUint32*>(whiteData)); blackData += 4; whiteData += 4; j++; } // This extra loop allows the compiler to do some more clever register // management and makes it about 5% faster than with only the 4 pixels // at a time loop. for (; j < size.width - 8; j += 8) { __m128i black1 = _mm_load_si128((__m128i*)blackData); __m128i white1 = _mm_load_si128((__m128i*)whiteData); __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16)); __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16)); // Execute the same instructions as described in RecoverPixel, only // using an SSE2 packed saturated subtract. white1 = _mm_subs_epu8(white1, black1); white2 = _mm_subs_epu8(white2, black2); white1 = _mm_subs_epu8(greenMask, white1); white2 = _mm_subs_epu8(greenMask, white2); // Producing the final black pixel in an XMM register and storing // that is actually faster than doing a masked store since that // does an unaligned store. We have the black pixel in a register // anyway. black1 = _mm_andnot_si128(alphaMask, black1); black2 = _mm_andnot_si128(alphaMask, black2); white1 = _mm_slli_si128(white1, 2); white2 = _mm_slli_si128(white2, 2); white1 = _mm_and_si128(alphaMask, white1); white2 = _mm_and_si128(alphaMask, white2); black1 = _mm_or_si128(white1, black1); black2 = _mm_or_si128(white2, black2); _mm_store_si128((__m128i*)blackData, black1); _mm_store_si128((__m128i*)(blackData + 16), black2); blackData += 32; whiteData += 32; } for (; j < size.width - 4; j += 4) { __m128i black = _mm_load_si128((__m128i*)blackData); __m128i white = _mm_load_si128((__m128i*)whiteData); white = _mm_subs_epu8(white, black); white = _mm_subs_epu8(greenMask, white); black = _mm_andnot_si128(alphaMask, black); white = _mm_slli_si128(white, 2); white = _mm_and_si128(alphaMask, white); black = _mm_or_si128(white, black); _mm_store_si128((__m128i*)blackData, black); blackData += 16; whiteData += 16; } // Loop single pixels until we're done. while (j < size.width) { *((PRUint32*)blackData) = RecoverPixel(*reinterpret_cast<PRUint32*>(blackData), *reinterpret_cast<PRUint32*>(whiteData)); blackData += 4; whiteData += 4; j++; } blackData += blackSurf->Stride() - j * 4; whiteData += whiteSurf->Stride() - j * 4; } blackSurf->MarkDirty(); return PR_TRUE; }
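As the comments say, the packed path mirrors RecoverPixel: derive alpha from the green-channel difference between the surface composited on white and the one composited on black, then splice it into the black pixel. The scalar sketch below is my reading of the saturated-subtract sequence above; the real RecoverPixel and the greenMaski/alphaMaski constants are not shown in this snippet, so treat it as an assumption-laden reference, not the original helper.

#include <stdint.h>

/* Hypothetical scalar equivalent of the packed path, assuming 0xAARRGGBB layout. */
static uint32_t RecoverPixelScalar(uint32_t black, uint32_t white)
{
    uint32_t blackGreen = (black >> 8) & 0xFF;
    uint32_t whiteGreen = (white >> 8) & 0xFF;
    /* subs_epu8(white, black): the white background raises the channel by (255 - alpha) */
    uint32_t diff  = whiteGreen > blackGreen ? whiteGreen - blackGreen : 0;
    /* subs_epu8(greenMask, diff): 255 - diff (diff never exceeds 255 here) */
    uint32_t alpha = 0xFF - diff;
    /* andnot(alphaMask, black) keeps RGB; the shifted green difference supplies alpha */
    return (black & 0x00FFFFFF) | (alpha << 24);
}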
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) { int i, j; Size roi = _src.size(); roi.width *= _src.channels(); const short* src = (const short*)_src.data; short* dst = (short*)_dst.data; size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif if( _src.isContinuous() && _dst.isContinuous() ) { roi.width *= roi.height; roi.height = 1; src_step = dst_step = roi.width; } #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type)) return; #endif #if defined(HAVE_IPP) IppiSize sz = { roi.width, roi.height }; switch( type ) { case THRESH_TRUNC: if (0 <= ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh)) return; setIppErrorStatus(); break; case THRESH_TOZERO: if (0 <= ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0)) return; setIppErrorStatus(); break; case THRESH_TOZERO_INV: if (0 <= ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0)) return; setIppErrorStatus(); break; } #endif switch( type ) { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_and_si128( v0, maxval8 ); v1 = _mm_and_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] > thresh ? maxval : 0; } break; case THRESH_BINARY_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_andnot_si128( v0, maxval8 ); v1 = _mm_andnot_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] <= thresh ? 
maxval : 0; } break; case THRESH_TRUNC: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_min_epi16( v0, thresh8 ); v1 = _mm_min_epi16( v1, thresh8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) dst[j] = std::min(src[j], thresh); } break; case THRESH_TOZERO: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8)); v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8)); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v > thresh ? v : 0; } } break; case THRESH_TOZERO_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0); v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v <= thresh ? v : 0; } } break; default: return CV_Error( CV_StsBadArg, "" ); } }
int camCompareDescriptors(const int *desc1, const int *desc2, const int s) { int i, j, distance = 0; __m128i sum, d1, d2, md, d, cmp; __m128i *p1 = (__m128i*)desc1, *p2 = (__m128i*)desc2; ALIGN(int out_sse[4], 16); /* Looks like a good idea... But this deteriorates performance... // Software prefetch d1 = _mm_load_si128(p1); d2 = _mm_load_si128(p2); for (i = 0; i != s; i += 32) { _mm_prefetch(&desc1[i], _MM_HINT_NTA); _mm_prefetch(&desc2[i], _MM_HINT_NTA); } */ sum = _mm_setzero_si128(); for (i = 0; i != s >> 4; i++) { // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); // 32-bits SAD for 4 integers in parallel d1 = _mm_loadu_si128(p1++); d2 = _mm_loadu_si128(p2++); d = _mm_sub_epi32(d1, d2); md = _mm_sub_epi32(d2, d1); cmp = _mm_cmplt_epi32(d, _mm_setzero_si128()); md = _mm_and_si128(cmp, md); d = _mm_andnot_si128(cmp, d); sum = _mm_add_epi32(sum, md); sum = _mm_add_epi32(sum, d); } _mm_store_si128((__m128i*)out_sse, sum); return out_sse[0] + out_sse[1] + out_sse[2] + out_sse[3]; }
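The cmplt/and/andnot triple above is a branchless select between (d1 - d2) and (d2 - d1), i.e. |d1 - d2| per 32-bit lane, accumulated across the descriptor. A scalar reference for what the unrolled loop computes (assuming, as the vector path does, that s is a multiple of 16):

/* Scalar reference for the sum of absolute differences accumulated above. */
int camCompareDescriptorsScalar(const int *desc1, const int *desc2, const int s)
{
    int distance = 0;
    for (int i = 0; i < s; i++) {
        int d = desc1[i] - desc2[i];
        distance += d < 0 ? -d : d; /* the cmp/and/andnot select keeps the non-negative difference */
    }
    return distance;
}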
__m128i t2(void) { return _mm_andnot_si128 (magic_a,magic_b); }
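As a reminder of the semantics all of these snippets rely on, _mm_andnot_si128(a, b) complements its first operand before the AND, so each result bit is (~a) & b. A minimal, self-contained check (the test constants are arbitrary, not taken from any snippet above):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi32(0x0F0F0F0F);
    __m128i b = _mm_set1_epi32(0x00FF00FF);
    __m128i r = _mm_andnot_si128(a, b);   /* (~a) & b, bit for bit */

    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%08x (expected %08x)\n", out[0], (~0x0F0F0F0Fu) & 0x00FF00FFu); /* 00f000f0 */
    return 0;
}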
//void xfft(int gatepow[NGATEAUTO][NFAUTO],char *obuf, char *ibuf, __m128i norm[NCHAN/2][FFTLEN/8],short int idelay[4],int nthread,double time0, double period) void xfft(char *obuf, char *ibuf, __m128i norm[NCHAN/2][FFTLEN/8],short int idelay[4],int nthread,char walshsign[NCHAN][NWALSH2]) { int nbuf,k; __declspec(align(128)) static short int fftbuf[4][NCHAN][FFTLEN]; // cache align this nbuf=NT/FFTLEN/NCHAN/2; // the last factor of two because half the data is sent off to quad cores omp_set_num_threads(nthread); #pragma omp parallel for default(none) shared(obuf,ibuf,norm,nbuf,fftbuf,idelay,walshsign) schedule(dynamic,64) for (k=0;k<nbuf-1;k++){ int i,j,r32,i32,io,imp; short int i16,r16,igate,*ibuf16; register __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *fftbuf_sse; #ifdef _OPENMP imp=omp_get_thread_num(); #else imp=0; #endif /* we want fftbuf to stay in cache */ for (j=0;j<NCHAN;j++) { for(i=0;i<FFTLEN;i++) { char ctmp,ctmp1; ctmp=ibuf[(k*FFTLEN+(i+idelay[j]))*NCHAN+j]; // ctmp1=(ctmp & 0b10111111) | (ctmp >> 1 & 0b0100000); // clip fftbuf[imp][j][i]=ctmp*walshsign[j][k/NTCHUNK]; } fft1dippirc16(fftbuf[imp][j],fftbuf[imp][j],1); fftbuf_sse=fftbuf[imp][j]; for(i=0;i<FFTLEN/8;i++) fftbuf_sse[i]=_mm_mulhi_epi16(fftbuf_sse[i],norm[j][i]); r7=_mm_set1_epi8(0xf0); for (i=0;i<FFTLEN/2;i+=FFTBLOCK){ #if 0 for (io=0;io<FFTBLOCK;io++){ // we process 2 numbers at once. r32=fftbuf[imp][j][2*(i+io)]; // r32=r32*norm[j][i+io]; i32=fftbuf[imp][j][2*(i+io)+1]; // i32*=norm[j][i+io]; obuf[io+j*FFTBLOCK+k*FFTBLOCK*NCHAN+i*(NT/(FFTLEN)/2)]=(r32 >> 16)&0x0f | (i32 >> 12)&0xf0; } #else for (io=0;io<FFTBLOCK;io+=2*8){ // we process 32 numbers at once. /* bits 5-8 are extracted(?) */ r0=_mm_load_si128(&fftbuf[imp][j][2*(i+io)]); r1=_mm_load_si128(&fftbuf[imp][j][2*(i+io)+8]); r2=_mm_load_si128(&fftbuf[imp][j][2*(i+io)+16]); r3=_mm_load_si128(&fftbuf[imp][j][2*(i+io)+24]); // squeeze four 16-bit ints into 4-bit ints #define MMSHUF _MM_SHUFFLE(3,1,2,0) // little endian, swap i1 r1 i0 r0 -> i1 i0 r1 r0 r5=_mm_shufflehi_epi16(r0,MMSHUF); r6=_mm_shufflelo_epi16(r5,MMSHUF); r0=_mm_shuffle_epi32(r6,MMSHUF); // i3 i2 r3 r2 i1 i0 r1 r0 -> i3210 r3210 r5=_mm_shufflehi_epi16(r1,MMSHUF); r6=_mm_shufflelo_epi16(r5,MMSHUF); r1=_mm_shuffle_epi32(r6,MMSHUF); r5=_mm_unpacklo_epi64(r0,r1); // r0=i3210r3210, r1=i7654r7654 -> r5=r76543210 r6=_mm_unpackhi_epi64(r0,r1); // r6=i76543210 r0=r5; r1=r6; // now for the second set r5=_mm_shufflehi_epi16(r2,MMSHUF); r6=_mm_shufflelo_epi16(r5,MMSHUF); r2=_mm_shuffle_epi32(r6,MMSHUF); r5=_mm_shufflehi_epi16(r3,MMSHUF); r6=_mm_shufflelo_epi16(r5,MMSHUF); r3=_mm_shuffle_epi32(r6,MMSHUF); r5=_mm_unpacklo_epi64(r2,r3); r6=_mm_unpackhi_epi64(r2,r3); r2=r5; // r5 is the real part r3=r6; /* this part reduces the number of bits to LSB with saturate */ r5=_mm_packs_epi16(r0,r2); // r5=rFEDCBA9876543210, saturate r0=_mm_srli_epi16(r5,4); // in little-endian, real into LSB // modified next few lines to just store MSB's. r0=_mm_andnot_si128(r7,r0);//zero 4 MSB r6=_mm_packs_epi16(r1,r3); // imaginary r1=_mm_and_si128(r6,r7); r2=_mm_or_si128(r0,r1); /* write without polluting caches */ _mm_stream_si128(&obuf[io+j*FFTBLOCK+k*FFTBLOCK*NCHAN+i*(NT/FFTLEN/2)],r2); /* the outgoing structure is obuf[FFTREST][TIME][CHAN][FFTBLOCK]. The BLOCK is cache friendly, the FFTREST is the MPI transpose order, and we need all channels locally for the correlation. */ } #endif // prefetch obuf non-persistent } } } }
void fb_slvn_low(dig_t *c, const dig_t *a) { int i; dig_t *p, u0, u1, u2, u3; void *tab = fb_poly_get_slv(); __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm; perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000); mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100); sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400); t0 = _mm_load_si128((__m128i *)a); t1 = _mm_load_si128((__m128i *)(a + 2)); r0 = r1 = _mm_setzero_si128(); m0 = _mm_shuffle_epi8(t1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_and_si128(m1, mask2); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m0 = _mm_and_si128(t0, mask2); m0 = _mm_shuffle_epi8(m0, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_srli_si128(m1, 8); m1 = _mm_andnot_si128(mask2, m1); m2 = _mm_slli_epi64(m2, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 6); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 7); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 1); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F)); m1 = _mm_slli_epi64(m1, 4); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3)); m1 = _mm_slli_epi64(m1, 2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1)); m1 = _mm_slli_epi64(m1, 1); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000); sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000); m1 = _mm_and_si128(t0, mask0); m2 = _mm_and_si128(t0, mask1); m3 = _mm_and_si128(t1, mask0); m4 = _mm_and_si128(t1, mask1); m2 = 
_mm_srli_epi64(m2, 4); m4 = _mm_srli_epi64(m4, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m4 = _mm_shuffle_epi8(sqrt1, m4); m3 = _mm_shuffle_epi8(sqrt0, m3); m1 = _mm_or_si128(m1, m2); m3 = _mm_or_si128(m3, m4); #ifndef __PCLMUL__ align dig_t x[2]; _mm_store_si128((__m128i *)x, m1); u0 = x[0]; u1 = x[1]; _mm_store_si128((__m128i *)x, m3); u2 = x[0]; u3 = x[1]; #else u0 = _mm_extract_epi64(m1, 0); u1 = _mm_extract_epi64(m1, 1); u2 = _mm_extract_epi64(m3, 0); u3 = _mm_extract_epi64(m3, 1); #endif for (i = 0; i < 8; i++) { p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u0 >>= 8; p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u1 >>= 8; p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u2 >>= 8; p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u3 >>= 8; } _mm_store_si128((__m128i *)c, r0); _mm_store_si128((__m128i *)(c + 2), r1); }
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) { int i, j; Size roi = _src.size(); roi.width *= _src.channels(); const short* src = _src.ptr<short>(); short* dst = _dst.ptr<short>(); size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif if( _src.isContinuous() && _dst.isContinuous() ) { roi.width *= roi.height; roi.height = 1; src_step = dst_step = roi.width; } #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type)) return; #endif #if defined(HAVE_IPP) CV_IPP_CHECK() { IppiSize sz = { roi.width, roi.height }; CV_SUPPRESS_DEPRECATED_START switch( type ) { case THRESH_TRUNC: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; case THRESH_TOZERO: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; case THRESH_TOZERO_INV: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; } CV_SUPPRESS_DEPRECATED_END } #endif switch( type ) { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_and_si128( v0, maxval8 ); v1 = _mm_and_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); for( ; j <= roi.width - 8; j += 8 ) { uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] > thresh ? 
maxval : 0; } break; case THRESH_BINARY_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_andnot_si128( v0, maxval8 ); v1 = _mm_andnot_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); for( ; j <= roi.width - 8; j += 8 ) { uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] <= thresh ? maxval : 0; } break; case THRESH_TRUNC: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_min_epi16( v0, thresh8 ); v1 = _mm_min_epi16( v1, thresh8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) dst[j] = std::min(src[j], thresh); } break; case THRESH_TOZERO: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8)); v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8)); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) { int16x8_t v_src = vld1q_s16(src + j); uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v > thresh ? v : 0; } } break; case THRESH_TOZERO_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0); v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) { int16x8_t v_src = vld1q_s16(src + j); uint16x8_t v_mask = vcleq_s16(v_src, v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v <= thresh ? v : 0; } } break; default: return CV_Error( CV_StsBadArg, "" ); } }
static inline __m128i _mm_blendv_epi8_rpl(__m128i a, __m128i b, __m128i mask) { a = _mm_andnot_si128(mask, a); a = _mm_or_si128(a, _mm_and_si128(mask, b)); return a; }
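This helper reproduces SSE4.1's _mm_blendv_epi8 with SSE2 bitwise ops: where mask bits are set the result takes b, elsewhere a. The bitwise form assumes the mask lanes are fully set or fully clear (e.g. a compare result); the real instruction keys off each byte's sign bit only. A small usage sketch under that assumption, with illustrative values:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static inline __m128i blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask)
{
    /* keep a where mask is clear, take b where mask is set */
    return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

int main(void)
{
    __m128i a = _mm_set1_epi16(1);
    __m128i b = _mm_set1_epi16(2);
    /* compares produce all-ones/all-zeros lanes, exactly what the bitwise blend expects */
    __m128i mask = _mm_cmpgt_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), _mm_set1_epi16(3));
    __m128i r = blendv_epi8_sse2(a, b, mask);

    int16_t out[8];
    _mm_storeu_si128((__m128i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);   /* 1 1 1 1 2 2 2 2 */
    printf("\n");
    return 0;
}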
static inline void image_insert_noov_8888_to_8888(image& target, int32_t x, int32_t y, const image& source) { for (uint32_t yy=0;yy<source.height;yy++) { uint32_t* __restrict targetpx = target.pixels32 + (y+yy)*target.stride/sizeof(uint32_t) + x; uint32_t* __restrict sourcepx = source.pixels32 + yy*source.stride/sizeof(uint32_t); //strangely enough, doing this slows things down. //if (!checksrcalpha && newalpha==-1) //{ // memcpy(targetpx, sourcepx, sizeof(uint32_t)*source.width); // continue; //} // TODO: enable AUTOVECTORIZE on -O3 - Gcc autovectorizes the post-SIMD loop... #if defined(__SSE2__) && !defined(AUTOVECTORIZE) //SIMD translation of the below //this particular loop is trivial to vectorize, but there's no vectorization on -Os //(in fact, on -O3, compiler vectorizes the post-SIMD loop that never has more than three iterations... grumble grumble...) size_t nsimd = 4; __m128i* __restrict targetpxw = (__m128i*)targetpx; __m128i* __restrict sourcepxw = (__m128i*)sourcepx; uint32_t xxew = source.width/nsimd; __m128i mask_or = (newalpha == 0xFF000000 ? _mm_set1_epi32(0xFF000000) : _mm_set1_epi32(0x00000000)); __m128i mask_and = (newalpha == 0x00000000 ? _mm_set1_epi32(0x00FFFFFF) : _mm_set1_epi32(0xFFFFFFFF)); //I could do a few non-SIMD iterations before that and use aligned instructions, // but intel intrinsics guide say they're same speed, so yawn for (uint32_t xx=0;xx<xxew;xx++) { __m128i px = _mm_loadu_si128(&sourcepxw[xx]); //copy sign bit to everywhere __m128i mask_local = _mm_srai_epi32(px, 31); px = _mm_and_si128(mask_and, _mm_or_si128(mask_or, px)); if (checksrcalpha) { __m128i tpx = _mm_loadu_si128(&targetpxw[xx]); //if mask_local bit is set, copy from sp, otherwise from tp //this is AVX2 _mm_maskstore_epi32, but that's not available in SSE2 //but it's also easy to bithack (either with xor or andnot; latter gives shorter dependency chains) px = _mm_or_si128(_mm_and_si128(mask_local, px), _mm_andnot_si128(mask_local, tpx)); } _mm_storeu_si128(&targetpxw[xx], px); } #else //the one-pixel loop is needed to handle the last few pixels without overflow //if there's no SIMD, just run it for everything size_t xxew = 0; size_t nsimd = 0; #endif for (uint32_t xx=xxew*nsimd;xx<source.width;xx++) { uint32_t spx = sourcepx[xx]; uint32_t tpx = targetpx[xx]; if (!checksrcalpha || (spx&0x80000000)) // for bargb, check sign only, it's the cheapest { if (newalpha == 0xFF000000 && checksrcalpha) // if spx&0x80000000 is set, the entire 0xFF000000 must be set, tpx = spx; // so we can just copy that, and save ourselves an OR else if (newalpha != (uint32_t)-1) tpx = newalpha | (spx&0x00FFFFFF); else tpx = spx; } targetpx[xx] = tpx; // don't inline this into the above, always writing lets compilers vectorize better } } }
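The `_mm_srai_epi32(px, 31)` above is the usual sign-broadcast trick: it turns each dword into an all-ones or all-zeros lane mask, which the and/andnot/or pair then uses as a per-pixel select in place of AVX2's _mm_maskstore_epi32. A minimal demonstration of just the mask construction (values chosen for illustration):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* broadcast each dword's sign bit across the lane: >= 0 -> 0x00000000, < 0 -> 0xFFFFFFFF */
    __m128i px   = _mm_set_epi32((int32_t)0x80000001, 0x7FFFFFFF, (int32_t)0xFFFFFFFF, 0);
    __m128i mask = _mm_srai_epi32(px, 31);

    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, mask);
    printf("%08x %08x %08x %08x\n", out[3], out[2], out[1], out[0]);
    /* ffffffff 00000000 ffffffff 00000000 */
    return 0;
}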
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i three = _mm_set1_epi16(3); // Load, combine and transpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b02 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); // b0_extra = (a0 != 0); const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one); const __m128i b0_base = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); const __m128i b0 = _mm_add_epi16(b0_base, b0_extra); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); { // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); // b = abs(b) = (b ^ sign) - sign A_b0 = _mm_xor_si128(A_b0, sign_A_b0); A_b2 = _mm_xor_si128(A_b2, sign_A_b2); B_b0 = _mm_xor_si128(B_b0, sign_B_b0); B_b2 = _mm_xor_si128(B_b2, sign_B_b2); A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); } // b = abs(b) + 3 A_b0 = _mm_add_epi16(A_b0, three); A_b2 = _mm_add_epi16(A_b2, three); B_b0 = _mm_add_epi16(B_b0, three); B_b2 = _mm_add_epi16(B_b2, three); // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 // b = (abs(b) + 3) >> 3 A_b0 = _mm_srai_epi16(A_b0, 3); A_b2 = _mm_srai_epi16(A_b2, 3); B_b0 = _mm_srai_epi16(B_b0, 3); B_b2 = _mm_srai_epi16(B_b2, 3); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b0 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b0); } return sum[0] + sum[1] + sum[2] + sum[3]; }
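The abs step above uses the classic branchless identity abs(b) = (b ^ sign) - sign with sign = b >> 15 (arithmetic shift, all ones for negative values, which is exactly what _mm_srai_epi16 produces per lane). A one-value scalar check of that identity:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int16_t b = -1234;
    int16_t sign  = (int16_t)(b >> 15);            /* 0 if b >= 0, all-ones (-1) if b < 0 */
    int16_t abs_b = (int16_t)((b ^ sign) - sign);  /* flip the bits and add 1 only when negative */
    printf("%d\n", abs_b);                         /* 1234 */
    return 0;
}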
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t *p0 = (uint16_t *)buff + 8; uint16_t *p1 = p0 + bstride; uint16_t *p2 = p1 + bstride; uint16_t *p3 = p2 + bstride; uint16_t *p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128i all1 = _mm_cmpeq_epi32(zero, zero); __m128i one = _mm_srli_epi32(all1, 31); __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h); __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix_h[5]; __m128i matrix_v[5]; int sign_h[5]; int sign_v[5]; for (int i = 0; i < 5; i++) { sign_h[i] = ch->m_h[i] < 0 ? 1 : 0; sign_v[i] = ch->m_v[i] < 0 ? 1 : 0; uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i]; matrix_h[i] = _mm_set1_epi16((int16_t)val); val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i]; matrix_v[i] = _mm_set1_epi16((int16_t)val); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); for (int x = 0; x < width; x += 8) { uint16_t *array[] = { p0 + x, p1 + x, p2 + x, p3 + x, p4 + x, p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2 }; for (int j = 0; j < 2; j++) { __m128i *matrix = j == 0 ? matrix_v : matrix_h; int *sign = j == 0 ? sign_v : sign_h; __m128 rdiv = j == 0 ? rdiv_v : rdiv_h; __m128i sum[2]; sum[0] = _mm_setzero_si128(); sum[1] = _mm_setzero_si128(); for (int i = 0; i < 5; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]); xmm1 = _mm_mullo_epi16(xmm0, matrix[i]); xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]); xmm2 = _mm_unpacklo_epi16(xmm1, xmm0); xmm0 = _mm_unpackhi_epi16(xmm1, xmm0); if (sign[i]) { xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1)); xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1)); } sum[0] = _mm_add_epi32(sum[0], xmm2); sum[1] = _mm_add_epi32(sum[1], xmm0); } for (int i = 0; i < 2; i++) { __m128 sumfp; __m128i mask, temp; sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); if (j == 1) { sumfp = _mm_add_ps(sumfp, bias); } sum[i] = _mm_cvttps_epi32(sumfp); temp = _mm_srli_epi32(all1, 16); mask = _mm_cmplt_epi32(sum[i], temp); sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask), _mm_andnot_si128(mask, temp)); mask = _mm_cmpgt_epi32(sum[i], zero); if (ch->saturate) { sum[i] = _mm_and_si128(mask, sum[i]); } else { temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1)); sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]), _mm_andnot_si128(mask, temp)); } } sum[0] = mm_cast_epi32(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
/* * vPMD raw receive routine, only accepts (nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP) * * Notice: * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST * numbers of DD bit * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two * - don't support ol_flags for rss and csum err */ static inline uint16_t _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union ixgbe_adv_rx_desc *rxdp; struct ixgbe_rx_entry *sw_ring; uint16_t nb_pkts_recd; int pos; uint64_t var; __m128i shuf_msk; __m128i crc_adjust = _mm_set_epi16( 0, 0, 0, /* ignore non-length fields */ -rxq->crc_len, /* sub crc on data_len */ 0, /* ignore high-16bits of pkt_len */ -rxq->crc_len, /* sub crc on pkt_len */ 0, 0 /* ignore pkt_type field */ ); __m128i dd_check, eop_check; /* nb_pkts shall be less than or equal to RTE_IXGBE_MAX_RX_BURST */ nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST); /* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP); /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->rx_ring + rxq->rx_tail; _mm_prefetch((const void *)rxdp, _MM_HINT_T0); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH) ixgbe_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->wb.upper.status_error & rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD))) return 0; /* 4 packets DD mask */ dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL); /* 4 packets EOP mask */ eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL); /* mask to shuffle from desc. to mbuf */ shuf_msk = _mm_set_epi8( 7, 6, 5, 4, /* octet 4~7, 32bits rss */ 15, 14, /* octet 14~15, low 16 bits vlan_macip */ 13, 12, /* octet 12~13, 16 bits data_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 13, 12, /* octet 12~13, low 16 bits pkt_len */ 0xFF, 0xFF, /* skip 32 bit pkt_type */ 0xFF, 0xFF ); /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ sw_ring = &rxq->sw_ring[rxq->rx_tail]; /* A. load 4 packet in one loop * [A*. mask out 4 unused dirty field in desc] * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; pos += RTE_IXGBE_DESCS_PER_LOOP, rxdp += RTE_IXGBE_DESCS_PER_LOOP) { __m128i descs[RTE_IXGBE_DESCS_PER_LOOP]; __m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; __m128i zero, staterr, sterr_tmp1, sterr_tmp2; __m128i mbp1, mbp2; /* two mbuf pointers in one XMM reg. 
*/ /* B.1 load 1 mbuf point */ mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3)); /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1); /* B.1 load 1 mbuf point */ mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]); descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2)); /* B.1 load 2 mbuf point */ descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1)); descs[0] = _mm_loadu_si128((__m128i *)(rxdp)); /* B.2 copy 2 mbuf point into rx_pkts */ _mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2); if (split_packet) { rte_mbuf_prefetch_part2(rx_pkts[pos]); rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } /* avoid compiler reorder optimization */ rte_compiler_barrier(); /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk); pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk); pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]); /* set ol_flags with vlan packet type */ desc_to_olflags_v(descs, &rx_pkts[pos]); /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust); pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust); /* C.2 get 4 pkts staterr value */ zero = _mm_xor_si128(dd_check, dd_check); staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2); /* D.3 copy final 3,4 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1, pkt_mb4); _mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1, pkt_mb3); /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust); pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust); /* C* extract and record EOP bit */ if (split_packet) { __m128i eop_shuf_mask = _mm_set_epi8( 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x04, 0x0C, 0x00, 0x08 ); /* and with mask to extract bits, flipping 1-0 */ __m128i eop_bits = _mm_andnot_si128(staterr, eop_check); /* the staterr values are not in order, as the count of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. 
This also * compresses the 32-bit values to 8-bit */ eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ *(int *)split_packet = _mm_cvtsi128_si32(eop_bits); split_packet += RTE_IXGBE_DESCS_PER_LOOP; /* zero-out next pointers */ rx_pkts[pos]->next = NULL; rx_pkts[pos + 1]->next = NULL; rx_pkts[pos + 2]->next = NULL; rx_pkts[pos + 3]->next = NULL; } /* C.3 calc available number of desc */ staterr = _mm_and_si128(staterr, dd_check); staterr = _mm_packs_epi32(staterr, zero); /* D.3 copy final 1,2 data to rx_pkts */ _mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1, pkt_mb2); _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); /* C.4 calc available number of desc */ var = __builtin_popcountll(_mm_cvtsi128_si64(staterr)); nb_pkts_recd += var; if (likely(var != RTE_IXGBE_DESCS_PER_LOOP)) break; } /* Update our internal tail pointer */ rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd); rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
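The descriptor-count step above keeps only the DD bit of each status word, packs the four dwords into the low 64 bits, and popcounts them; the loop stops as soon as fewer than four descriptors are done. A standalone sketch of just that counting step, with hypothetical status values (not taken from real hardware):

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Suppose descriptors 0 and 1 have DD set, 2 and 3 do not (illustrative values). */
    __m128i staterr  = _mm_set_epi32(0, 0, 1, 1);
    __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);
    __m128i zero     = _mm_setzero_si128();

    staterr = _mm_and_si128(staterr, dd_check);   /* keep only the DD bit per descriptor */
    staterr = _mm_packs_epi32(staterr, zero);     /* 4 x 32-bit -> 4 x 16-bit in the low half */
    int ndd = __builtin_popcountll((unsigned long long)_mm_cvtsi128_si64(staterr));
    printf("%d descriptors done\n", ndd);         /* 2 */
    return 0;
}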
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *p3 = p2 + bstride; uint8_t *p4 = p3 + bstride; uint8_t *orig = p0, *end = p4; line_copy8(p0, srcp + 2 * stride, width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128i all1 = _mm_cmpeq_epi32(zero, zero); __m128i one = _mm_srli_epi16(all1, 15); __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h); __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix_h[5]; __m128i matrix_v[5]; for (int i = 0; i < 5; i++) { matrix_h[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_h[i]), zero); matrix_v[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_v[i]), zero); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); for (int x = 0; x < width; x += 16) { uint8_t *array[] = { p0 + x, p1 + x, p2 + x, p3 + x, p4 + x, p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2 }; for (int j = 0; j < 2; j++) { __m128i *matrix = j == 0 ? matrix_v : matrix_h; __m128i sum[4]; sum[0] = _mm_setzero_si128(); sum[1] = _mm_setzero_si128(); sum[2] = _mm_setzero_si128(); sum[3] = _mm_setzero_si128(); for (int i = 0; i < 5; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]); xmm2 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i])); sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i])); xmm1 = _mm_unpackhi_epi16(xmm2, zero); xmm0 = _mm_unpacklo_epi16(xmm2, zero); sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i])); sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i])); } for (int i = 0; i < 4; i++) { __m128 sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, j == 0 ? rdiv_v : rdiv_h); if (j == 1) { sumfp = _mm_add_ps(sumfp, bias); } sum[i] = _mm_cvttps_epi32(sumfp); } sum[0] = _mm_packs_epi32(sum[0], sum[1]); sum[1] = _mm_packs_epi32(sum[2], sum[3]); if (!ch->saturate) { for (int i = 0; i < 2; i++) { __m128i mask = _mm_cmplt_epi16(sum[i], zero); __m128i temp = _mm_add_epi16(one, _mm_xor_si128(sum[i], all1)); temp = _mm_and_si128(temp, mask); sum[i] = _mm_andnot_si128(mask, sum[i]); sum[i] = _mm_or_si128(sum[i], temp); } } sum[0] = _mm_packus_epi16(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
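Both convolution routines above negate packed lanes without a packed-negate instruction by adding one to the bitwise complement, i.e. the two's-complement identity -x == (~x) + 1, which is what `_mm_add_epi32(one, _mm_xor_si128(x, all1))` (or the epi16 variant) performs per lane. A scalar check of the identity:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int32_t x   = 12345;
    int32_t neg = (int32_t)(~(uint32_t)x + 1u);   /* (~x) + 1 == -x in two's complement */
    printf("%d %d\n", neg, -x);                   /* -12345 -12345 */
    return 0;
}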