static u64 siphash(const u8 key[16], const unsigned char *m, const u64 n) { __m128i v0, v1, v2, v3; __m128i k0, k1; __m128i mi, mask, len; size_t i, k; union { u64 gpr; __m128i xmm; } hash; k0 = _mm_loadl_epi64((__m128i*)(key + 0)); k1 = _mm_loadl_epi64((__m128i*)(key + 8)); v0 = _mm_xor_si128(k0, _mm_set_epi32(0, 0, 0x736f6d65, 0x70736575)); v1 = _mm_xor_si128(k1, _mm_set_epi32(0, 0, 0x646f7261, 0x6e646f6d)); v2 = _mm_xor_si128(k0, _mm_set_epi32(0, 0, 0x6c796765, 0x6e657261)); v3 = _mm_xor_si128(k1, _mm_set_epi32(0, 0, 0x74656462, 0x79746573)); #define HALF_ROUND(a,b,c,d,s,t) \ do \ { \ a = _mm_add_epi64(a, b); c = _mm_add_epi64(c, d); \ b = _mm_roti_epi64(b, s); d = _mm_roti_epi64(d, t); \ b = _mm_xor_si128(b, a); d = _mm_xor_si128(d, c); \ } while(0) #define COMPRESS(v0,v1,v2,v3) \ do \ { \ HALF_ROUND(v0,v1,v2,v3,13,16); \ v0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(1,0,3,2)); \ HALF_ROUND(v2,v1,v0,v3,17,21); \ v2 = _mm_shufflelo_epi16(v2, _MM_SHUFFLE(1,0,3,2)); \ } while(0) for(i = 0; i < (n-n%8); i += 8) { mi = _mm_loadl_epi64((__m128i*)(m + i)); v3 = _mm_xor_si128(v3, mi); for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v0,v1,v2,v3); v0 = _mm_xor_si128(v0, mi); } mi = _mm_loadl_epi64((__m128i*)(m + i)); len = _mm_set_epi32(0, 0, (n&0xff) << 24, 0); mask = _mm_srli_epi64(_mm_set_epi32(0, 0, 0xffffffff, 0xffffffff), 8*(8-n%8)); mi = _mm_xor_si128(_mm_and_si128(mi, mask), len); v3 = _mm_xor_si128(v3, mi); for(k = 0; k < SIPHASH_ROUNDS; ++k) COMPRESS(v0,v1,v2,v3); v0 = _mm_xor_si128(v0, mi); v2 = _mm_xor_si128(v2, _mm_set_epi32(0, 0, 0, 0xff)); for(k = 0; k < SIPHASH_FINALROUNDS; ++k) COMPRESS(v0,v1,v2,v3); v0 = _mm_xor_si128(_mm_xor_si128(v0, v1), _mm_xor_si128(v2, v3)); hash.xmm = v0; #undef COMPRESS #undef HALF_ROUND //return _mm_extract_epi32(v0, 0) | (((u64)_mm_extract_epi32(v0, 1)) << 32); return hash.gpr; }
// 現在の局面の評価値の内訳を表示する。 void print_eval_stat(Position& pos) { cout << "--- EVAL STAT\n"; Square sq_bk = pos.king_square(BLACK); Square sq_wk = pos.king_square(WHITE); const auto* ppkppb = kpp[sq_bk]; const auto* ppkppw = kpp[Inv(sq_wk)]; auto& pos_ = *const_cast<Position*>(&pos); auto list_fb = pos_.eval_list()->piece_list_fb(); auto list_fw = pos_.eval_list()->piece_list_fw(); int i, j; BonaPiece k0, k1, l0, l1; // 38枚の駒を表示 for (i = 0; i < PIECE_NO_KING; ++i) cout << int(list_fb[i]) << " = " << list_fb[i] << " , " << int(list_fw[i]) << " = " << list_fw[i] << endl; // 評価値の合計 EvalSum sum; // SSE2は少なくとも有るという前提で。 // sum.p[0](BKPP)とsum.p[1](WKPP)をゼロクリア sum.m[0] = _mm_setzero_si128(); // KK sum.p[2] = kk[sq_bk][sq_wk]; cout << "KKC : " << sq_bk << " " << sq_wk << " = " << kk[sq_bk][sq_wk][0] << " + " << kk[sq_bk][sq_wk][1] << "\n"; for (i = 0; i < PIECE_NO_KING; ++i) { k0 = list_fb[i]; k1 = list_fw[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (j = 0; j < i; ++j) { l0 = list_fb[j]; l1 = list_fw[j]; #if 0 sum.p[0] += pkppb[l0]; sum.p[1] += pkppw[l1]; #else // SSEによる実装 // pkppw[l1][0],pkppw[l1][1],pkppb[l0][0],pkppb[l0][1]の16bit変数4つを整数拡張で32bit化して足し合わせる __m128i tmp; tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0])); tmp = _mm_cvtepi16_epi32(tmp); sum.m[0] = _mm_add_epi32(sum.m[0], tmp); cout << "BKPP : " << sq_bk << " " << k0 << " " << l0 << " = " << pkppb[l0][0] << " + " << pkppb[l0][1] << "\n"; cout << "WKPP : " << sq_wk << " " << k1 << " " << l1 << " = " << pkppw[l1][0] << " + " << pkppw[l1][1] << "\n"; #endif } sum.p[2] += kkp[sq_bk][sq_wk][k0]; cout << "KKP : " << sq_bk << " " << sq_wk << " " << k0 << " = " << kkp[sq_bk][sq_wk][k0][0] << " + " << kkp[sq_bk][sq_wk][k0][1] << "\n"; } cout << "Material = " << pos.state()->materialValue << endl; cout << sum; cout << "---\n"; }
real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; nvdwtype = fr->ntype; vdwparam = fr->nbfp;
if(Index < MaxSize)
	{
		// Scalar path: byte-swap each 32-bit word of the indexed position and
		// append it to the vertex buffer.
		const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
		((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]);
		((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]);
		// The third component is either read from the source or written as 0.0f.
		if (three)
			((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]);
		else
			((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f;
		LOG_VTX();
		// Three 32-bit components are always emitted.
		VertexManager::s_pCurBufferPointer += 12;
	}
}

#if _M_SSE >= 0x301
// Byte-shuffle control masks for _mm_shuffle_epi8.
// kMaskSwap32_3 reverses the byte order inside each of the first three 32-bit
// words; the fourth word's selectors are 0xFF, which makes the shuffle write
// zero there.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
// kMaskSwap32_2 reverses the byte order inside the first two 32-bit words and
// zeroes the third and fourth.
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);

// SSSE3 variant of the indexed float position reader: one unaligned 16-byte
// load, one byte shuffle that performs the per-word byte swap, one unaligned
// 16-byte store.
//   three   - true: three components come from the source; false: the third
//             output word is zero (via kMaskSwap32_2).
//   MaxSize - indices >= MaxSize are ignored.
template<bool three,int MaxSize>
void Pos_ReadIndex_Float_SSSE3(int Index)
{
	if(Index < MaxSize)
	{
		const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION]));
		// NOTE(review): this loads 16 bytes although at most 12 are consumed;
		// presumably the source array is padded enough for that - confirm.
		GC_ALIGNED128(const __m128i a = _mm_loadu_si128((__m128i*)pData));
		GC_ALIGNED128(__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2));
		// NOTE(review): 16 bytes are stored but the pointer advances by 12, so
		// the last 4 (zero) bytes spill into the next vertex slot; this assumes
		// the destination buffer has at least 4 bytes of slack - confirm.
		_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b);
		LOG_VTX();
		VertexManager::s_pCurBufferPointer += 12;
	}
}
/*
 * Squares the binary-field element a (four digits, 16-byte aligned) and
 * writes the reduced four-digit result to c.
 *
 * Squaring is done by bit spreading: a 16-entry byte table maps every nibble
 * abcd to the byte 0a0b0c0d, applied with a byte shuffle to the low and high
 * nibbles of each input byte and re-interleaved.  The eight-digit expansion
 * is then folded back with shift/XOR steps.
 *
 * NOTE(review): the shift amounts (5, 7, 9, 12 and their 64-bit complements)
 * are consistent with reduction by x^251 + x^7 + x^4 + x^2 + 1; confirm
 * against the configured field polynomial.
 */
void fb_sqrm_low(dig_t *c, const dig_t *a) {
	__m128i spread, nibbles, in, lo, hi;
	align dig_t t[2*FB_DIGS];
	const int rh = 59, ra = 52, rb = 55, rc = 57;
	const int lh = 5, la = 12, lb = 9, lc = 7;
	dig_t d, a0, a1, a2, a3, a4;

	spread = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
	nibbles = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

	/* Expand digits 0 and 1 into t[0..3]. */
	in = _mm_load_si128((__m128i *)(a));
	lo = _mm_shuffle_epi8(spread, _mm_and_si128(in, nibbles));
	hi = _mm_shuffle_epi8(spread, _mm_and_si128(_mm_srli_epi64(in, 4), nibbles));
	_mm_store_si128((__m128i *) t + 0, _mm_unpacklo_epi8(lo, hi));
	_mm_store_si128((__m128i *) t + 1, _mm_unpackhi_epi8(lo, hi));

	/* Expand digits 2 and 3 into t[4..7]. */
	in = _mm_load_si128((__m128i *)(a+2));
	lo = _mm_shuffle_epi8(spread, _mm_and_si128(in, nibbles));
	hi = _mm_shuffle_epi8(spread, _mm_and_si128(_mm_srli_epi64(in, 4), nibbles));
	_mm_store_si128((__m128i *) t + 2, _mm_unpacklo_epi8(lo, hi));
	_mm_store_si128((__m128i *) t + 3, _mm_unpackhi_epi8(lo, hi));

	a0 = t[0];
	a1 = t[1];
	a2 = t[2];
	a3 = t[3];
	a4 = t[4];

	/* Fold digit 7 into digits 4 and 3. */
	d = t[7];
	a4 ^= (d >> rh) ^ (d >> ra) ^ (d >> rb) ^ (d >> rc);
	a3 ^= (d << lh) ^ (d << la) ^ (d << lb) ^ (d << lc);

	/* Fold digit 6 into digits 3 and 2. */
	d = t[6];
	a3 ^= (d >> rh) ^ (d >> ra) ^ (d >> rb) ^ (d >> rc);
	a2 ^= (d << lh) ^ (d << la) ^ (d << lb) ^ (d << lc);

	/* Fold digit 5 into digits 2 and 1. */
	d = t[5];
	a2 ^= (d >> rh) ^ (d >> ra) ^ (d >> rb) ^ (d >> rc);
	a1 ^= (d << lh) ^ (d << la) ^ (d << lb) ^ (d << lc);

	/* Fold the (already updated) digit 4 into digits 1 and 0. */
	d = a4;
	a1 ^= (d >> rh) ^ (d >> ra) ^ (d >> rb) ^ (d >> rc);
	a0 ^= (d << lh) ^ (d << la) ^ (d << lb) ^ (d << lc);

	/* Fold the top 5 bits of digit 3 into digit 0 and clear them. */
	d = a3 >> rh;
	a0 ^= d;
	d <<= rh;
	a0 ^= (d >> ra) ^ (d >> rb) ^ (d >> rc);
	a3 ^= d;

	c[3] = a3;
	c[2] = a2;
	c[1] = a1;
	c[0] = a0;
}
/* blend pixel x color --> dst */ #ifdef BUILD_SSE3 static void _op_blend_p_c_dp_sse3(DATA32 *s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { DATA32 alpha; const __m128i c_packed = _mm_set_epi32(c, c, c, c); LOOP_ALIGNED_U1_A48(d, l, { /* UOP */ DATA32 sc = MUL4_SYM(c, *s); alpha = 256 - (sc >> 24); *d = sc + MUL_256(alpha, *d); d++; s++; l--; }, { /* A4OP */ __m128i s0 = _mm_lddqu_si128((__m128i *)s); __m128i d0 = _mm_load_si128((__m128i *)d); __m128i sc0 = mul4_sym_sse3(c_packed, s0); __m128i a0 = sub4_alpha_sse3(sc0); __m128i mul0 = mul_256_sse3(a0, d0); d0 = _mm_add_epi32(sc0, mul0); _mm_store_si128((__m128i *)d, d0);
/*
 * Initializes a hash state for a digest of hashbitlen bits.
 *
 * hashbitlen must be a multiple of 8 in the range [8, 512]; otherwise
 * BAD_HASHBITLEN is returned and the state is left untouched.
 *
 * For 512 and 256 bits the state is loaded from precomputed initial values.
 * Any other length builds the initial state from the parameter block
 * (hashbitlen/8, CUBEHASH_BLOCKBYTES, CUBEHASH_ROUNDS) and runs
 * 10 * CUBEHASH_ROUNDS transform rounds over it.
 *
 * Returns SUCCESS on success.
 */
HashReturn Init(hashState *state, int hashbitlen)
{
  /* Rows are in _mm_set_epi32 argument order (highest lane first). */
  static const unsigned int iv512[8][4] = {
    { 0x0b36e608, 0x05b52a93, 0x7921fcd6, 0xda36534a },
    { 0xebda27b3, 0x50cd5525, 0xb58aca24, 0xeed070c8 },
    { 0x255496c0, 0x94af3e63, 0x3fd05131, 0x81cc27ae },
    { 0xbe04628c, 0x2b17175b, 0x9b08376d, 0xeaa9a52a },
    { 0x945cad1d, 0x5460aad2, 0x9aff127b, 0xcbe8089e },
    { 0xdc4e051c, 0xf44c3ca2, 0x589e48c1, 0x36b10735 },
    { 0x5fd4b059, 0x83527968, 0x6b58d4e5, 0xc26263a8 },
    { 0x93ea938a, 0x9c907347, 0x19c63a7d, 0x46a3c3fa }
  };
  static const unsigned int iv256[8][4] = {
    { 0x64176600, 0xdbfeabb5, 0x89530944, 0xb64a8504 },
    { 0x7cfb8b3c, 0xf60b509f, 0xbfd29d06, 0xb2010492 },
    { 0x4a81c613, 0xa06a6f57, 0xf13bba5b, 0x1be0a20b },
    { 0xb706819f, 0x032b6e1d, 0x8faaa641, 0x1224ae2d },
    { 0x1d3afa60, 0x2efd9495, 0xc17d2a27, 0x0c200b55 },
    { 0xe7216507, 0xd95ab1b4, 0x1d301585, 0x4a3d926b },
    { 0x99cbf61b, 0xed946c6f, 0x9b7537a6, 0x843b37d6 },
    { 0x3bfdcd0b, 0xd9afab81, 0x75cc9745, 0x37d57956 }
  };
  int row;

  if (hashbitlen < 8 || hashbitlen > 512 || hashbitlen % 8 != 0)
    return BAD_HASHBITLEN;

  state->hashbitlen = hashbitlen;
  state->pos = 0;

  switch (hashbitlen) {
  case 512:
    for (row = 0; row < 8; ++row)
      state->x[row] = _mm_set_epi32(iv512[row][0], iv512[row][1], iv512[row][2], iv512[row][3]);
    break;

  case 256:
    for (row = 0; row < 8; ++row)
      state->x[row] = _mm_set_epi32(iv256[row][0], iv256[row][1], iv256[row][2], iv256[row][3]);
    break;

  default:
    /* No precomputed IV: derive it from the parameter block. */
    state->x[0] = _mm_set_epi32(0, CUBEHASH_ROUNDS, CUBEHASH_BLOCKBYTES, hashbitlen / 8);
    for (row = 1; row < 8; ++row)
      state->x[row] = _mm_setzero_si128();
    transform(state, 10 * CUBEHASH_ROUNDS);
    break;
  }

  return SUCCESS;
}
#endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int32_t score = NEG_INF; __m128i vNegInf = _mm_set1_epi32(NEG_INF); __m128i vNegInf0 = _mm_srli_si128(vNegInf, 4); /* shift in a 0 */ __m128i vOpen = _mm_set1_epi32(open); __m128i vGap = _mm_set1_epi32(gap); __m128i vZero = _mm_set1_epi32(0); __m128i vOne = _mm_set1_epi32(1); __m128i vN = _mm_set1_epi32(N); __m128i vNegOne = _mm_set1_epi32(-1); __m128i vI = _mm_set_epi32(0,1,2,3); __m128i vJreset = _mm_set_epi32(0,-1,-2,-3); __m128i vMax = vNegInf; __m128i vEndI = vNegInf; __m128i vEndJ = vNegInf; __m128i vILimit = _mm_set1_epi32(s1Len); __m128i vJLimit = _mm_set1_epi32(s2Len); /* convert _s1 from char to int in range 0-23 */ for (i=0; i<s1Len; ++i) { s1[i] = matrix->mapper[(unsigned char)_s1[i]]; } /* pad back of s1 with dummy values */ for (i=s1Len; i<s1Len_PAD; ++i) { s1[i] = 0; /* point to first matrix row because we don't care */
/*
 * Computes a four-digit result c from the four-digit input a with SSSE3
 * shuffles and the lookup table returned by fb_poly_get_slv().
 *
 * NOTE(review): judging by the name and the table accessor this presumably
 * solves a quadratic equation over the binary field (c^2 + c = a); that
 * cannot be confirmed from this block alone.
 *
 * Both a and c are accessed with aligned 128-bit loads/stores, so they must
 * be 16-byte aligned.
 *
 * Outline of what the code does:
 *  1. A cascade of steps, each taking a progressively narrower slice of the
 *     working value t0 (initially a[0..1]; the first step uses t1 = a[2..3]),
 *     compressing its even-position bits and XORing the shifted result into
 *     both t0 and the accumulator r0.
 *  2. The odd-position bits of every byte of t0 and t1 are compressed into a
 *     4-bit value per byte.
 *  3. Each such 4-bit value selects one table entry (a separate group of 16
 *     entries per byte position); all selected entries are XORed into r0/r1.
 */
void fb_slvn_low(dig_t *c, const dig_t *a) {
	int i;
	dig_t *p, u0, u1, u2, u3;
	void *tab = fb_poly_get_slv();
	__m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm;

	/* Byte permutation: even-indexed bytes go to the low 8 bytes, odd-indexed
	 * bytes to the high 8 bytes. */
	perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);
	/* mask2 selects the upper 64 bits; mask1/mask0 select the high/low nibble
	 * of every byte. */
	mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
	mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0);
	mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
	/* Nibble-indexed tables: sqrt0 maps a nibble to its two even-position bits
	 * (bits 0 and 2) packed together; sqrt1 holds the same values shifted left
	 * by two. */
	sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100);
	sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400);

	/* t0 = a[0..1], t1 = a[2..3]; r0/r1 accumulate the result. */
	t0 = _mm_load_si128((__m128i *)a);
	t1 = _mm_load_si128((__m128i *)(a + 2));
	r0 = r1 = _mm_setzero_si128();

	/* Step 1: compress the even bits of t1 and fold them into t0 and r0. */
	m0 = _mm_shuffle_epi8(t1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_and_si128(m1, mask2);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* Step 2: same compression on the upper 64 bits of the updated t0. */
	m0 = _mm_and_si128(t0, mask2);
	m0 = _mm_shuffle_epi8(m0, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_srli_si128(m1, 8);
	m1 = _mm_andnot_si128(mask2, m1);
	m2 = _mm_slli_epi64(m2, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* Step 3: same compression on the 32 bits at byte offset 4 of t0. */
	m1 = _mm_srli_si128(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 6);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* Step 4: same compression on the 16 bits at byte offset 2 of t0. */
	m1 = _mm_srli_si128(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 7);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* Step 5: the byte at offset 1 - gather its even bits (mask 0x55) into the
	 * low nibble with shift/or/mask steps, then place them at bit 4. */
	m1 = _mm_srli_si128(t0, 1);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F));
	m1 = _mm_slli_epi64(m1, 4);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* Step 6: the nibble at bit 4 - gather its even bits and place them at
	 * bit 2. */
	m1 = _mm_srli_epi64(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3));
	m1 = _mm_slli_epi64(m1, 2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* Step 7: bit 2 is folded into bit 1. */
	m1 = _mm_srli_epi64(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1));
	m1 = _mm_slli_epi64(m1, 1);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	/* The tables are redefined: sqrt0 now maps a nibble to its two
	 * odd-position bits (bits 1 and 3); sqrt1 is again the same shifted left
	 * by two. */
	sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000);
	sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000);

	/* Compress the odd bits of every byte of t0 (into m1) and t1 (into m3);
	 * afterwards each byte carries a 4-bit value in its low nibble. */
	m1 = _mm_and_si128(t0, mask0);
	m2 = _mm_and_si128(t0, mask1);
	m3 = _mm_and_si128(t1, mask0);
	m4 = _mm_and_si128(t1, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m4 = _mm_srli_epi64(m4, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m4 = _mm_shuffle_epi8(sqrt1, m4);
	m3 = _mm_shuffle_epi8(sqrt0, m3);
	m1 = _mm_or_si128(m1, m2);
	m3 = _mm_or_si128(m3, m4);

	/* Move the four 64-bit halves into general-purpose digits, either through
	 * an aligned scratch buffer or with direct 64-bit extracts. */
#ifndef __PCLMUL__
	align dig_t x[2];
	_mm_store_si128((__m128i *)x, m1);
	u0 = x[0];
	u1 = x[1];
	_mm_store_si128((__m128i *)x, m3);
	u2 = x[0];
	u3 = x[1];
#else
	u0 = _mm_extract_epi64(m1, 0);
	u1 = _mm_extract_epi64(m1, 1);
	u2 = _mm_extract_epi64(m3, 0);
	u3 = _mm_extract_epi64(m3, 1);
#endif

	/* For each of the 32 byte positions (8 per digit) the byte's low nibble
	 * selects one of 16 entries in that position's group of the table; every
	 * entry is sizeof(fb_st) bytes and its four digits are XORed into r0/r1.
	 * NOTE(review): the table entries are dereferenced as __m128i, so the
	 * table is assumed to be 16-byte aligned - confirm. The arithmetic on the
	 * void pointer relies on the GNU C extension. */
	for (i = 0; i < 8; i++) {
		p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u0 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u1 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u2 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u3 >>= 8;
	}

	/* Write the four result digits. */
	_mm_store_si128((__m128i *)c, r0);
	_mm_store_si128((__m128i *)(c + 2), r1);
}
==================== TransformJoints ==================== */ static void TransformJoints( idJointMat *__restrict outJoints, const int numJoints, const idJointMat *__restrict inJoints1, const idJointMat *__restrict inJoints2 ) { float * outFloats = outJoints->ToFloatPtr(); const float * inFloats1 = inJoints1->ToFloatPtr(); const float * inFloats2 = inJoints2->ToFloatPtr(); assert_16_byte_aligned( outFloats ); assert_16_byte_aligned( inFloats1 ); assert_16_byte_aligned( inFloats2 ); const __m128 mask_keep_last = __m128c( _mm_set_epi32( 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 ) ); for ( int i = 0; i < numJoints; i += 2, inFloats1 += 2 * 12, inFloats2 += 2 * 12, outFloats += 2 * 12 ) { __m128 m1a0 = _mm_load_ps( inFloats1 + 0 * 12 + 0 ); __m128 m1b0 = _mm_load_ps( inFloats1 + 0 * 12 + 4 ); __m128 m1c0 = _mm_load_ps( inFloats1 + 0 * 12 + 8 ); __m128 m1a1 = _mm_load_ps( inFloats1 + 1 * 12 + 0 ); __m128 m1b1 = _mm_load_ps( inFloats1 + 1 * 12 + 4 ); __m128 m1c1 = _mm_load_ps( inFloats1 + 1 * 12 + 8 ); __m128 m2a0 = _mm_load_ps( inFloats2 + 0 * 12 + 0 ); __m128 m2b0 = _mm_load_ps( inFloats2 + 0 * 12 + 4 ); __m128 m2c0 = _mm_load_ps( inFloats2 + 0 * 12 + 8 ); __m128 m2a1 = _mm_load_ps( inFloats2 + 1 * 12 + 0 ); __m128 m2b1 = _mm_load_ps( inFloats2 + 1 * 12 + 4 ); __m128 m2c1 = _mm_load_ps( inFloats2 + 1 * 12 + 8 );
bool scanhash_sse2_32(struct thr_info*thr, const unsigned char *pmidstate, unsigned char *pdata, unsigned char *phash1, unsigned char *phash, const unsigned char *ptarget, uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce) { uint32_t *hash32 = (uint32_t *)phash; uint32_t *nNonce_p = (uint32_t *)(pdata + 76); uint32_t m_midstate[8], m_w[16], m_w1[16]; __m128i m_4w[64] __attribute__ ((aligned (0x100))); __m128i m_4hash[64] __attribute__ ((aligned (0x100))); __m128i m_4hash1[64] __attribute__ ((aligned (0x100))); __m128i offset; int i; pdata += 64; /* Message expansion */ memcpy(m_midstate, pmidstate, sizeof(m_midstate)); memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ memcpy(m_w1, phash1, sizeof(m_w1)); memset(m_4hash, 0, sizeof(m_4hash)); /* Transmongrify */ for (i = 0; i < 16; i++) m_4w[i] = _mm_set1_epi32(m_w[i]); for (i = 0; i < 16; i++) m_4hash1[i] = _mm_set1_epi32(m_w1[i]); for (i = 0; i < 64; i++) sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]); offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); for (;;) { int j; m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce)); /* Some optimization can be done here W.R.T. precalculating some hash */ CalcSha256_x86 (m_4hash1, m_4w, m_midstate); CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init); for (j = 0; j < 4; j++) { if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) { /* We found a hit...so check it */ /* Use the C version for a check... */ for (i = 0; i < 8; i++) { *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j]; } if (unlikely(hash32[7] == 0 && fulltest(phash, ptarget))) { nonce += j; *last_nonce = nonce; *nNonce_p = nonce; return true; } } } if (unlikely((nonce >= max_nonce) || thr->work_restart)) { *last_nonce = nonce; return false; } nonce += 4; } }
/*
 * SipHash of the n-byte message m under the 128-bit key (ik0, ik1),
 * evaluated in the low 64-bit lane of SSE2 registers, with SIPHASH_ROUNDS
 * compression rounds per word and SIPHASH_FINALROUNDS finalization rounds
 * (the common 2 / 4 configuration is unrolled).
 *
 * ik0, ik1  the two 64-bit key halves.
 * m         message bytes; only m[0..n-1] are ever read.
 * n         message length in bytes.
 *
 * Returns the 64-bit hash.
 *
 * The trailing n%8 bytes are gathered one byte at a time.  The previous
 * page-boundary special-casing tried to make an 8-byte over-read safe, but
 * it dropped tail bytes (and so produced a wrong hash) whenever m + n fell
 * exactly on a 4096-byte boundary, or when the tail straddled an 8-byte
 * boundary within the last 8 bytes of a page.  Reading only the bytes that
 * exist needs no special cases.
 */
u64 hashable_siphash24_sse2(u64 ik0, u64 ik1, const u8 *m, size_t n)
{
    __m128i v0, v1, v2, v3;
    __m128i k0, k1;
    __m128i mi;
    size_t i, k;
    u64 last;
    union { u64 gpr; __m128i xmm; } hash;

    /* We used to use the _mm_set_epi32 intrinsic to initialize SSE2
       registers.  This compiles to a movdqa instruction, which requires
       16-byte alignment.  On 32-bit Windows, it looks like ghc's runtime
       linker doesn't align ".rdata" sections as requested, so we got
       segfaults for our trouble.

       Now we use an intrinsic that cares less about alignment
       (_mm_loadu_si128, aka movdqu) instead, and all seems happy. */

    static const u32 iv[5][4] = {
	{ 0x70736575, 0x736f6d65, 0, 0 },	/* "somepseu" */
	{ 0x6e646f6d, 0x646f7261, 0, 0 },	/* "dorandom" */
	{ 0x6e657261, 0x6c796765, 0, 0 },	/* "lygenera" */
	{ 0x79746573, 0x74656462, 0, 0 },	/* "tedbytes" */
	{ 255, 0, 0, 0 },			/* finalization constant */
    };

    k0 = _mm_loadl_epi64((__m128i*)(&ik0));
    k1 = _mm_loadl_epi64((__m128i*)(&ik1));

    v0 = _mm_xor_si128(k0, _mm_loadu_si128((__m128i*) &iv[0]));
    v1 = _mm_xor_si128(k1, _mm_loadu_si128((__m128i*) &iv[1]));
    v2 = _mm_xor_si128(k0, _mm_loadu_si128((__m128i*) &iv[2]));
    v3 = _mm_xor_si128(k1, _mm_loadu_si128((__m128i*) &iv[3]));

#define HALF_ROUND(a,b,c,d,s,t) \
    do \
    { \
	a = _mm_add_epi64(a, b);  c = _mm_add_epi64(c, d); \
	b = _mm_roti_epi64(b, s); d = _mm_roti_epi64(d, t); \
	b = _mm_xor_si128(b, a);  d = _mm_xor_si128(d, c); \
    } while(0)

    /* The 16-bit lane shuffle swaps the two 32-bit halves of the low
       quadword, i.e. rotates the 64-bit value by 32 bits. */
#define COMPRESS(v0,v1,v2,v3) \
    do \
    { \
	HALF_ROUND(v0,v1,v2,v3,13,16); \
	v0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(1,0,3,2)); \
	HALF_ROUND(v2,v1,v0,v3,17,21); \
	v2 = _mm_shufflelo_epi16(v2, _MM_SHUFFLE(1,0,3,2)); \
    } while(0)

    /* Absorb every complete 8-byte word. */
    for(i = 0; i < (n-n%8); i += 8)
    {
	mi = _mm_loadl_epi64((__m128i*)(m + i));
	v3 = _mm_xor_si128(v3, mi);
	if (SIPHASH_ROUNDS == 2) {
	    COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
	} else {
	    for (k = 0; k < SIPHASH_ROUNDS; ++k)
		COMPRESS(v0,v1,v2,v3);
	}
	v0 = _mm_xor_si128(v0, mi);
    }

    /* Final word: the remaining n%8 bytes in little-endian order with
       (n mod 256) in the most significant byte.  Built byte by byte so
       that nothing beyond m[n-1] is touched, wherever the buffer ends
       relative to a page boundary. */
    last = (u64)(n & 0xff) << 56;
    for (k = 0; k < n % 8; ++k)
	last |= (u64)m[i + k] << (8 * k);
    mi = _mm_loadl_epi64((__m128i*)&last);

    v3 = _mm_xor_si128(v3, mi);
    if (SIPHASH_ROUNDS == 2) {
	COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
    } else {
	for (k = 0; k < SIPHASH_ROUNDS; ++k)
	    COMPRESS(v0,v1,v2,v3);
    }
    v0 = _mm_xor_si128(v0, mi);

    /* Finalization. */
    v2 = _mm_xor_si128(v2, _mm_loadu_si128((__m128i*) &iv[4]));
    if (SIPHASH_FINALROUNDS == 4) {
	COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
	COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
    } else {
	for (k = 0; k < SIPHASH_FINALROUNDS; ++k)
	    COMPRESS(v0,v1,v2,v3);
    }

    v0 = _mm_xor_si128(_mm_xor_si128(v0, v1), _mm_xor_si128(v2, v3));
    hash.xmm = v0;

#undef COMPRESS
#undef HALF_ROUND

    /* The low 64 bits of the register are the result. */
    return hash.gpr;
}
static inline void desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4], struct rte_mbuf **rx_pkts) { const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); __m128i rearm0, rearm1, rearm2, rearm3; __m128i vlan0, vlan1, rss, l3_l4e; /* mask everything except RSS, flow director and VLAN flags * bit2 is for VLAN tag, bit11 for flow director indication * bit13:12 for RSS indication. */ const __m128i rss_vlan_msk = _mm_set_epi32( 0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804); const __m128i cksum_mask = _mm_set_epi32( PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD); /* map rss and vlan type to rss hash and vlan flag */ const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, 0, 0, 0, 0); const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0, 0, 0, PKT_RX_FDIR, 0); const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, /* shift right 1 bit to make sure it not exceed 255 */ (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1, (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1, (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1, PKT_RX_IP_CKSUM_BAD >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1); vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]); vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]); vlan0 = _mm_unpacklo_epi64(vlan0, vlan1); 
vlan1 = _mm_and_si128(vlan0, rss_vlan_msk); vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1); rss = _mm_srli_epi32(vlan1, 11); rss = _mm_shuffle_epi8(rss_flags, rss); l3_l4e = _mm_srli_epi32(vlan1, 22); l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e); /* then we shift left 1 bit */ l3_l4e = _mm_slli_epi32(l3_l4e, 1); /* we need to mask out the reduntant bits */ l3_l4e = _mm_and_si128(l3_l4e, cksum_mask); vlan0 = _mm_or_si128(vlan0, rss); vlan0 = _mm_or_si128(vlan0, l3_l4e); /* * At this point, we have the 4 sets of flags in the low 16-bits * of each 32-bit value in vlan0. * We want to extract these, and merge them with the mbuf init data * so we can do a single 16-byte write to the mbuf to set the flags * and all the other initialization fields. Extracting the * appropriate flags means that we have to do a shift and blend for * each mbuf before we do the write. */ rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10); rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10); rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10); rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10); /* write the rearm data and the olflags in one write */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != offsetof(struct rte_mbuf, rearm_data) + 8); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); _mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0); _mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1); _mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2); _mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3); }
/*
 * SipHash of the n-byte message m under the 128-bit key (ik0, ik1),
 * evaluated in the low 64-bit lane of SSE2 registers, with SIPHASH_ROUNDS
 * compression rounds per word and SIPHASH_FINALROUNDS finalization rounds
 * (the common 2 / 4 configuration is unrolled).
 *
 * ik0, ik1  the two 64-bit key halves.
 * m         message bytes; only m[0..n-1] are ever read.
 * n         message length in bytes.
 *
 * Returns the 64-bit hash.
 *
 * The trailing n%8 bytes are gathered one byte at a time.  The previous
 * implementation fetched them with a full 8-byte load and masked afterwards,
 * which read up to 8 bytes beyond the end of the caller's buffer and could
 * fault when the buffer ended at the edge of a mapped page.
 */
u64 hashable_siphash24_sse2(u64 ik0, u64 ik1, const u8 *m, size_t n)
{
    __m128i v0, v1, v2, v3;
    __m128i k0, k1;
    __m128i mi;
    size_t i, k;
    u64 last;
    union { u64 gpr; __m128i xmm; } hash;

    /* We used to use the _mm_set_epi32 intrinsic to initialize SSE2
       registers.  This compiles to a movdqa instruction, which requires
       16-byte alignment.  On 32-bit Windows, it looks like ghc's runtime
       linker doesn't align ".rdata" sections as requested, so we got
       segfaults for our trouble.

       Now we use an intrinsic that cares less about alignment
       (_mm_loadu_si128, aka movdqu) instead, and all seems happy. */

    static const u32 iv[5][4] = {
	{ 0x70736575, 0x736f6d65, 0, 0 },	/* "somepseu" */
	{ 0x6e646f6d, 0x646f7261, 0, 0 },	/* "dorandom" */
	{ 0x6e657261, 0x6c796765, 0, 0 },	/* "lygenera" */
	{ 0x79746573, 0x74656462, 0, 0 },	/* "tedbytes" */
	{ 255, 0, 0, 0 },			/* finalization constant */
    };

    k0 = _mm_loadl_epi64((__m128i*)(&ik0));
    k1 = _mm_loadl_epi64((__m128i*)(&ik1));

    v0 = _mm_xor_si128(k0, _mm_loadu_si128((__m128i*) &iv[0]));
    v1 = _mm_xor_si128(k1, _mm_loadu_si128((__m128i*) &iv[1]));
    v2 = _mm_xor_si128(k0, _mm_loadu_si128((__m128i*) &iv[2]));
    v3 = _mm_xor_si128(k1, _mm_loadu_si128((__m128i*) &iv[3]));

#define HALF_ROUND(a,b,c,d,s,t) \
    do \
    { \
	a = _mm_add_epi64(a, b);  c = _mm_add_epi64(c, d); \
	b = _mm_roti_epi64(b, s); d = _mm_roti_epi64(d, t); \
	b = _mm_xor_si128(b, a);  d = _mm_xor_si128(d, c); \
    } while(0)

    /* The 16-bit lane shuffle swaps the two 32-bit halves of the low
       quadword, i.e. rotates the 64-bit value by 32 bits. */
#define COMPRESS(v0,v1,v2,v3) \
    do \
    { \
	HALF_ROUND(v0,v1,v2,v3,13,16); \
	v0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(1,0,3,2)); \
	HALF_ROUND(v2,v1,v0,v3,17,21); \
	v2 = _mm_shufflelo_epi16(v2, _MM_SHUFFLE(1,0,3,2)); \
    } while(0)

    /* Absorb every complete 8-byte word. */
    for(i = 0; i < (n-n%8); i += 8)
    {
	mi = _mm_loadl_epi64((__m128i*)(m + i));
	v3 = _mm_xor_si128(v3, mi);
	if (SIPHASH_ROUNDS == 2) {
	    COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
	} else {
	    for (k = 0; k < SIPHASH_ROUNDS; ++k)
		COMPRESS(v0,v1,v2,v3);
	}
	v0 = _mm_xor_si128(v0, mi);
    }

    /* Final word: the remaining n%8 bytes in little-endian order with
       (n mod 256) in the most significant byte.  Built byte by byte so
       that nothing beyond m[n-1] is touched. */
    last = (u64)(n & 0xff) << 56;
    for (k = 0; k < n % 8; ++k)
	last |= (u64)m[i + k] << (8 * k);
    mi = _mm_loadl_epi64((__m128i*)&last);

    v3 = _mm_xor_si128(v3, mi);
    if (SIPHASH_ROUNDS == 2) {
	COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
    } else {
	for (k = 0; k < SIPHASH_ROUNDS; ++k)
	    COMPRESS(v0,v1,v2,v3);
    }
    v0 = _mm_xor_si128(v0, mi);

    /* Finalization. */
    v2 = _mm_xor_si128(v2, _mm_loadu_si128((__m128i*) &iv[4]));
    if (SIPHASH_FINALROUNDS == 4) {
	COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
	COMPRESS(v0,v1,v2,v3); COMPRESS(v0,v1,v2,v3);
    } else {
	for (k = 0; k < SIPHASH_FINALROUNDS; ++k)
	    COMPRESS(v0,v1,v2,v3);
    }

    v0 = _mm_xor_si128(_mm_xor_si128(v0, v1), _mm_xor_si128(v2, v3));
    hash.xmm = v0;

#undef COMPRESS
#undef HALF_ROUND

    /* The low 64 bits of the register are the result. */
    return hash.gpr;
}
// Lane-wise select for float vectors: takes if_set where mask is all ones
// and if_clear where it is all zeros.
static inline __m128 minmax_select_ps(__m128 mask, __m128 if_set, __m128 if_clear)
{
	return _mm_or_ps(_mm_and_ps(mask, if_set), _mm_andnot_ps(mask, if_clear));
}

// Lane-wise select for integer vectors, same convention as above.
static inline __m128i minmax_select_epi32(__m128i mask, __m128i if_set, __m128i if_clear)
{
	return _mm_or_si128(_mm_and_si128(mask, if_set), _mm_andnot_si128(mask, if_clear));
}

// Finds the minimum and maximum of buf[0..n-1] and the indices at which they
// occur.
//
//   n         number of elements in buf
//   buf       input values (no alignment requirement)
//   idx_min_  receives the index of the minimum
//   idx_max_  receives the index of the maximum
//   min_      receives the minimum value
//   max_      receives the maximum value
//
// Only SSE2 is required. Comparisons are strict, so NaN elements never
// replace the current extremum. For n == 0 the outputs are FLT_MAX / -FLT_MAX
// with both indices 0.
//
// Fixes relative to the previous version:
//  * the running maximum started at FLT_MIN (the smallest *positive* float),
//    so inputs without a value above FLT_MIN reported a wrong maximum; it now
//    starts at -FLT_MAX;
//  * lane indices are stored to an array instead of being fetched with
//    _mm_extract_epi32 and a loop variable (that intrinsic needs a
//    compile-time constant lane);
//  * the SSE4.1 blend is replaced by an SSE2 and/andnot/or select and the
//    aligned load by an unaligned one.
void minmax_vec(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
	__m128i sse_idx_min = _mm_setzero_si128();
	__m128i sse_idx_max = _mm_setzero_si128();
	__m128 sse_min = _mm_set1_ps(FLT_MAX);
	__m128 sse_max = _mm_set1_ps(-FLT_MAX);

	// The main loop is unrolled by four, covering the first n_sse elements.
	const uint32_t n_sse = n & ~3U;
	__m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i sse_4 = _mm_set1_epi32(4);
	for (uint32_t i = 0; i < n_sse; i += 4) {
		const __m128 sse_v = _mm_loadu_ps(&buf[i]);
		const __m128 is_lower = _mm_cmplt_ps(sse_v, sse_min);
		const __m128 is_higher = _mm_cmpgt_ps(sse_v, sse_max);

		sse_min = minmax_select_ps(is_lower, sse_v, sse_min);
		sse_max = minmax_select_ps(is_higher, sse_v, sse_max);
		sse_idx_min = minmax_select_epi32(_mm_castps_si128(is_lower), sse_idx, sse_idx_min);
		sse_idx_max = minmax_select_epi32(_mm_castps_si128(is_higher), sse_idx, sse_idx_max);

		sse_idx = _mm_add_epi32(sse_idx, sse_4);
	}

	// Horizontal reduction of the four lanes.
	float mins[4];
	float maxs[4];
	uint32_t idx_mins[4];
	uint32_t idx_maxs[4];
	_mm_storeu_ps(mins, sse_min);
	_mm_storeu_ps(maxs, sse_max);
	_mm_storeu_si128((__m128i*) idx_mins, sse_idx_min);
	_mm_storeu_si128((__m128i*) idx_maxs, sse_idx_max);

	float min = mins[0];
	float max = maxs[0];
	uint32_t idx_min = idx_mins[0];
	uint32_t idx_max = idx_maxs[0];
	for (int lane = 1; lane < 4; lane++) {
		if (mins[lane] < min) {
			min = mins[lane];
			idx_min = idx_mins[lane];
		}
		if (maxs[lane] > max) {
			max = maxs[lane];
			idx_max = idx_maxs[lane];
		}
	}

	// Epilogue: the up to three elements not covered by the vector loop.
	for (uint32_t i = n_sse; i < n; i++) {
		const float v = buf[i];
		if (v < min) {
			min = v;
			idx_min = i;
		}
		if (v > max) {
			max = v;
			idx_max = i;
		}
	}

	*idx_min_ = idx_min;
	*min_ = min;
	*idx_max_ = idx_max;
	*max_ = max;
}
/*
 * Compute one output sample of the sinc resampler for a single channel.
 *
 * The filter is applied in two halves around filter->b_current. Each tap is
 * obtained by linear interpolation between coeffs[indx] and coeffs[indx + 1],
 * where indx is the integer part and `fraction` the fractional part of the
 * fixed-point filter_index, and is multiplied by one sample of filter->buffer.
 *
 *   filter             - supplies coeffs, buffer, coeff_half_len and b_current.
 *   increment          - fixed-point step between consecutive taps.
 *   start_filter_index - fixed-point starting position within the coefficient table.
 *
 * Returns the sum of the left-half and right-half accumulations. When
 * RESAMPLER_SSE_OPT is defined, the bulk of each half is accumulated four taps
 * at a time in left128/right128 and the scalar loops only finish the remainder.
 */
static inline double calc_output_single (SINC_FILTER *filter, const increment_t increment, const increment_t start_filter_index)
{
#ifdef RESAMPLER_SSE_OPT
	__m128i increment4;
	__m128 left128,right128;
	float left,right;
#else
	double left,right;
#endif
	const coeff_t * const __restrict coeffs = filter->coeffs;
	const float * const __restrict buffer = filter->buffer;
	increment_t filter_index, max_filter_index ;
	int data_index, coeff_count;

	/* Convert input parameters into fixed point. */
	max_filter_index = int_to_fp (filter->coeff_half_len) ;

	/* First apply the left half of the filter. */
	/* Start at the largest filter_index reachable from start_filter_index in
	 * whole steps of `increment` without exceeding max_filter_index, then walk
	 * down towards zero while data_index walks up. */
	filter_index = start_filter_index ;
	coeff_count = (max_filter_index - filter_index) / increment ;
	filter_index = filter_index + coeff_count * increment ;
	data_index = filter->b_current - coeff_count ;

#ifdef RESAMPLER_SSE_OPT
	/* Lane k holds k * increment (lane 0 = 0), so subtracting from a broadcast
	 * filter_index yields the indices of four consecutive taps. */
	increment4 = _mm_set_epi32(increment * 3, increment * 2, increment, 0);

	left128 = _mm_setzero_ps();
	/* Four taps per iteration; the lowest lane index is filter_index - 3 * increment,
	 * which must stay >= 0 to match the scalar loop condition below. */
	while(filter_index >= increment * 3)
	{
#ifdef USE_WINDOWS_CODE
		__m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#else
		Windows__m128i indx;
		indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx.m128i,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#endif
		__m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below
		__m128 icoeff,icoeffp1,icoeffd,fraction;
#ifdef _DEBUG
		icoeff0 = icoeff2 = _mm_setzero_ps();
#endif
		/* Arithmetic shift keeps only the integer part of each fixed-point index. */
#ifdef USE_WINDOWS_CODE
		indx = _mm_srai_epi32(indx, SHIFT_BITS);
#else
		indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS);
#endif
		/* Each 64-bit load fetches the pair coeffs[i], coeffs[i + 1] for one tap
		 * (assumes coeff_t is a 32-bit float - TODO confirm).
		 * NOTE(review): .m128i_i32 is accessed on indx in both preprocessor
		 * branches; with USE_WINDOWS_CODE indx is a plain __m128i, so this relies
		 * on the compiler exposing that member - confirm. */
		icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1]));
		icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3]));

		/* De-interleave: even elements are coeffs[i], odd elements are coeffs[i + 1]. */
		icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0));
		icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1));
		icoeffd = _mm_sub_ps(icoeffp1, icoeff);
		/* Scale the integer fraction bits to a float and interpolate linearly. */
		fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE));
		icoeff = _mm_add_ps(icoeff,_mm_mul_ps(icoeffd, fraction));
		left128 = _mm_add_ps(left128,_mm_mul_ps(icoeff, _mm_loadu_ps(buffer + data_index)));
		data_index += 4;
		filter_index -= increment * 4;
	}
#endif
	/* Scalar loop: the whole left half without SSE, otherwise only the taps
	 * left over by the vector loop above. */
	left = 0.;
	while (filter_index >= MAKE_INCREMENT_T(0))
	{
		coeff_t fraction = fp_to_float(filter_index);
		int indx = fp_to_int(filter_index);
		coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]);
		left += icoeff * buffer[data_index];
		filter_index -= increment;
		data_index++;
	}

	/* Now apply the right half of the filter. */
	/* Mirror image of the left half: data_index now walks downwards, and the
	 * loops stop before filter_index reaches zero (strict comparisons). */
	filter_index = increment - start_filter_index ;
	coeff_count = (max_filter_index - filter_index) / increment ;
	filter_index = filter_index + coeff_count * increment ;
	data_index = filter->b_current + 1 + coeff_count ;

#ifdef RESAMPLER_SSE_OPT
	right128 = _mm_setzero_ps();
	while (filter_index > increment * 3)
	{
#ifdef USE_WINDOWS_CODE
		__m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#else
		Windows__m128i indx;
		indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4);
		__m128i fractioni = _mm_and_si128(indx.m128i, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1)));
#endif
		__m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below
		__m128 icoeff,icoeffp1,icoeffd,fraction,data;
#ifdef _DEBUG
		icoeff0 = icoeff2 = _mm_setzero_ps();
#endif
#ifdef USE_WINDOWS_CODE
		indx = _mm_srai_epi32(indx, SHIFT_BITS);
#else
		indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS);
#endif
		icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1]));
		icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3]));

		icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0));
		icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1));
		icoeffd = _mm_sub_ps(icoeffp1, icoeff);
		fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE));
		icoeff = _mm_add_ps(icoeff, _mm_mul_ps(icoeffd, fraction));
		/* Load buffer[data_index - 3 .. data_index] and reverse the lanes so that
		 * lane 0 (tap at filter_index) meets buffer[data_index]. */
		data = _mm_loadu_ps(buffer + (data_index - 3));
		right128 = _mm_add_ps(right128,_mm_mul_ps(icoeff, _mm_shuffle_ps(data,data,_MM_SHUFFLE(0,1,2,3))));
		data_index -= 4;
		filter_index -= increment * 4;
	}
#endif
	right = 0.;
	while (filter_index > MAKE_INCREMENT_T(0))
	{
		coeff_t fraction = fp_to_float(filter_index);
		int indx = fp_to_int(filter_index);
		coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]);
		right += icoeff * buffer[data_index];
		filter_index -= increment;
		data_index--;
	}

	/* Combine the vector partial sums (if any) with the scalar sums. */
	return (
#ifdef RESAMPLER_SSE_OPT
		_mm_cvtss_f32(horizontal_add(left128)) + _mm_cvtss_f32(horizontal_add(right128)) +
#endif
		left + right) ;
} /* calc_output_single */
double bst_compute_121_m128_aligned4( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, l_end_pre, j; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m128d v_tmp; __m128d v00, v01, v02, v03; __m128d v10, v11, v12, v13; __m128i v_cur_roots, v_old_roots, v_new_roots; __m128 v_rootmask; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx2, idx3, pad, pad_r; idx1 = (n+1)*(n+2)/2 + n/2; e[idx1] = q[n]; idx1++; pad = 1; // pad contains the padding for row i+1 // for row n it's always 1 for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } // idx2 now points to the beginning of the next line. idx2 += pad; // padding of line i+1 idx3 = idx1; pad_r = pad; // padding of line r for (r = i; r < n; ++r) { pad_r = !pad_r; // padding of line r+1 // idx2 = IDX(r+1, r+1); idx1 = idx3; l_end = idx2 + (n-r); e_tmp = e[idx1++]; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&3); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1] = r; } idx1++; } v_tmp = _mm_set_pd( e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm_set_epi32(r, r, r, r); for( ; idx2 < l_end; idx2 += 4 ) { v01 = _mm_load_pd( &w[idx1 ] ); v11 = _mm_load_pd( &w[idx1+2] ); v00 = _mm_load_pd( &e[idx2 ] ); v01 = _mm_add_pd( v01, v_tmp ); // supoptimal for raw-dependency v10 = _mm_load_pd( &e[idx2+2] ); v11 = _mm_add_pd( v11, v_tmp ); v01 = _mm_add_pd( v01, v00 ); v03 = _mm_load_pd( &e[idx1 ] ); v11 = _mm_add_pd( v11, v10 ); v13 = _mm_load_pd( &e[idx1+2] ); v02 = _mm_cmplt_pd( v01, v03 ); v12 = _mm_cmplt_pd( v11, v13 ); v00 = _mm_or_pd( _mm_and_pd( v02, v01 ), 
_mm_andnot_pd( v02, v03 )); v10 = _mm_or_pd( _mm_and_pd( v12, v11 ), _mm_andnot_pd( v12, v13 )); _mm_store_pd( &e[idx1 ], v00 ); _mm_store_pd( &e[idx1+2], v10 ); v_rootmask = _mm_shuffle_ps( _mm_castpd_ps( v02 ), _mm_castpd_ps( v12 ), _MM_SHUFFLE(0,2,0,2) ); v_old_roots = _mm_lddqu_si128( &root[idx1] ); v_new_roots = _mm_or_si128( _mm_and_si128( v_cur_roots, _mm_castps_si128( v_rootmask ) ), _mm_andnot_si128( v_old_roots, _mm_castps_si128( v_rootmask ) ) ); _mm_storeu_si128( &root[idx1], v_new_roots ); idx1 += 4; } idx2 += pad_r; idx3++; } pad = !pad; // every other line as padding 0, or 1, respectively } // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[n + !(n&1)]; }
} static inline void desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4] __rte_unused, struct rte_mbuf **rx_pkts) { const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); __m128i rearm0, rearm1, rearm2, rearm3; __m128i vlan0, vlan1, rss, l3_l4e; /* mask everything except RSS, flow director and VLAN flags * bit2 is for VLAN tag, bit11 for flow director indication * bit13:12 for RSS indication. */ const __m128i rss_vlan_msk = _mm_set_epi32( 0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804); const __m128i cksum_mask = _mm_set_epi32( PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD);
void nb_kernel430_sse2_double(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset,tj,tj2,nti; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0,-1.0}; const __m128d zero = {0.0,0.0}; const __m128d half = {0.5,0.5}; const __m128d two = {2.0,2.0}; const __m128d three = {3.0,3.0}; const __m128d six = {6.0,6.0}; const __m128d twelwe = {12.0,12.0}; const __m128i four = _mm_set_epi32(4,4,4,4); gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ Vvdwtmp = _mm_setzero_pd(); Vvdwtot = _mm_setzero_pd(); dvdatmp = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); vgb = 
_mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); xmm1 = _mm_setzero_pd(); xmm2 = _mm_setzero_pd(); xmm3 = _mm_setzero_pd(); xmm4 = _mm_setzero_pd(); jnr1 = jnr2 = 0; j13 = j23 = 0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); nti = 2*ntype*type[ii]; fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); Vvdwtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = _mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q = _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load VdW parameters */ tj = nti+2*type[jnr1]; tj2 = nti+2*type[jnr2]; xmm1 = _mm_loadu_pd(vdwparam+tj); xmm2 
= _mm_loadu_pd(vdwparam+tj2); c6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); c12 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); /* Calculate GB table index */ r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); /* Calculate VDW table index */ rt = _mm_mul_pd(r,tabscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - dispersion */ xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = 
_mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); Vvdw6 = _mm_mul_pd(c6,VV); fijD = _mm_mul_pd(c6,FF); /* Tabulated VdW interaction - repulsion */ nnn = _mm_add_epi32(nnn,four); xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); xmm1 = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,xmm1); Vvdw12 = _mm_mul_pd(c12,VV); fijR = _mm_mul_pd(c12,FF); Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6); Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp); xmm1 = _mm_add_pd(fijD,fijR); xmm1 = _mm_mul_pd(xmm1,tabscale); xmm1 = _mm_add_pd(xmm1,fijC); xmm1 = _mm_sub_pd(xmm1,fscal); fscal = _mm_mul_pd(xmm1,neg); fscal = _mm_mul_pd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_pd(fscal,dx); t2 = _mm_mul_pd(fscal,dy); t3 = _mm_mul_pd(fscal,dz); /* update the i force */ fix = _mm_add_pd(fix,t1); fiy 
= _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); /* accumulate forces from memory */ xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */ xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */ xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */ xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */ /* transpose */ xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */ xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */ xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* subtract partial forces */ xmm5 = _mm_sub_pd(xmm5,t1); xmm6 = _mm_sub_pd(xmm6,t2); xmm7 = _mm_sub_pd(xmm7,t3); xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */ xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* store fx and fy */ _mm_storeu_pd(faction+j13,xmm1); _mm_storeu_pd(faction+j23,xmm2); /* .. then fz */ _mm_storel_pd(faction+j13+2,xmm7); _mm_storel_pd(faction+j23+2,xmm7); } /* In double precision, offset can only be either 0 or 1 */ if(offset!=0) { jnr1 = jjnr[k]; j13 = jnr1*3; jx = _mm_load_sd(pos+j13); jy = _mm_load_sd(pos+j13+1); jz = _mm_load_sd(pos+j13+2); isaj = _mm_load_sd(invsqrta+jnr1); isaprod = _mm_mul_sd(isai,isaj); dvdaj = _mm_load_sd(dvda+jnr1); q = _mm_load_sd(charge+jnr1); qq = _mm_mul_sd(iq,q); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); qq = _mm_mul_sd(isaprod,qq); qq = _mm_mul_sd(qq,neg); gbscale = _mm_mul_sd(isaprod,gbtabscale); /* Load VdW parameters */ tj = nti+2*type[jnr1]; c6 = _mm_load_sd(vdwparam+tj); c12 = _mm_load_sd(vdwparam+tj+1); /* Calculate GB table index */ r = _mm_mul_sd(rsq11,rinv); rt = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = 
_mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); H = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,H); vgb = _mm_mul_sd(qq,VV); fijC = _mm_mul_sd(qq,FF); fijC = _mm_mul_sd(fijC,gbscale); dvdatmp = _mm_mul_sd(fijC,r); dvdatmp = _mm_add_sd(vgb,dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp,neg); dvdatmp = _mm_mul_sd(dvdatmp,half); dvdasum = _mm_add_sd(dvdasum,dvdatmp); xmm1 = _mm_mul_sd(dvdatmp,isaj); xmm1 = _mm_mul_sd(xmm1,isaj); dvdaj = _mm_add_sd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); vctot = _mm_add_sd(vctot,vcoul); vgbtot = _mm_add_sd(vgbtot,vgb); /* Calculate VDW table index */ rt = _mm_mul_sd(r,tabscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi32(n0,3); /* Tabulated VdW interaction - dispersion */ xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = 
_mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); Vvdw6 = _mm_mul_sd(c6,VV); fijD = _mm_mul_sd(c6,FF); /* Tabulated VdW interaction - repulsion */ nnn = _mm_add_epi32(nnn,four); xmm1 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); xmm1 = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,xmm1); Vvdw12 = _mm_mul_sd(c12,VV); fijR = _mm_mul_sd(c12,FF); Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6); Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp); xmm1 = _mm_add_sd(fijD,fijR); xmm1 = _mm_mul_sd(xmm1,tabscale); xmm1 = _mm_add_sd(xmm1,fijC); xmm1 = _mm_sub_sd(xmm1,fscal); fscal = _mm_mul_sd(xmm1,neg); fscal = _mm_mul_sd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_sd(fscal,dx); t2 = _mm_mul_sd(fscal,dy); t3 = _mm_mul_sd(fscal,dz); /* update the i force */ fix = _mm_add_sd(fix,t1); fiy = _mm_add_sd(fiy,t2); fiz = _mm_add_sd(fiz,t3); /* accumulate forces from memory */ xmm5 = _mm_load_sd(faction+j13); /* fx */ xmm6 = _mm_load_sd(faction+j13+1); /* fy */ xmm7 = _mm_load_sd(faction+j13+2); /* fz */ /* subtract partial forces */ xmm5 = _mm_sub_sd(xmm5,t1); xmm6 = _mm_sub_sd(xmm6,t2); xmm7 = _mm_sub_sd(xmm7,t3); /* store forces */ _mm_store_sd(faction+j13,xmm5); _mm_store_sd(faction+j13+1,xmm6); _mm_store_sd(faction+j13+2,xmm7); } /* fix/fiy/fiz now contain four partial terms, that all should be * added to the i particle forces */ t1 = _mm_unpacklo_pd(t1,fix); t2 = 
_mm_unpacklo_pd(t2,fiy); t3 = _mm_unpacklo_pd(t3,fiz); fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1)); fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1)); fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1)); /* Load i forces from memory */ xmm1 = _mm_load_sd(faction+ii3); xmm2 = _mm_load_sd(faction+ii3+1); xmm3 = _mm_load_sd(faction+ii3+2); /* Add to i force */ fix = _mm_add_sd(fix,xmm1); fiy = _mm_add_sd(fiy,xmm2); fiz = _mm_add_sd(fiz,xmm3); /* store i forces to memory */ _mm_store_sd(faction+ii3,fix); _mm_store_sd(faction+ii3+1,fiy); _mm_store_sd(faction+ii3+2,fiz); /* now do dvda */ dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum); dvdasum = _mm_add_pd(dvdasum,dvdatmp); _mm_storeh_pd(&dva,dvdasum); dvda[ii] = dvda[ii] + dva*isai_d*isai_d; ggid = gid[n]; /* Coulomb potential */ vcoul = _mm_unpacklo_pd(vcoul,vctot); vctot = _mm_add_pd(vctot,vcoul); _mm_storeh_pd(&vct,vctot); Vc[ggid] = Vc[ggid] + vct; /* VdW potential */ Vvdwtmp = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot); Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp); _mm_storeh_pd(&vdwt,Vvdwtot); Vvdw[ggid] = Vvdw[ggid] + vdwt; /* GB potential */ vgb = _mm_unpacklo_pd(vgb,vgbtot); vgbtot = _mm_add_pd(vgbtot,vgb); _mm_storeh_pd(&vgbt,vgbtot); gpol[ggid] = gpol[ggid] + vgbt; } *outeriter = nri; *inneriter = nj1; }
/// Construct a generator from an explicit seed.
/// The packed seed register is filled, from the highest 32-bit lane to the
/// lowest, with (iSeed, iSeed + 1, iSeed, iSeed + 1).
Random::Random(int iSeed)
	: mSeed(iSeed)
{
	const int nextSeed = iSeed + 1;
	mQuadSeed = _mm_set_epi32(iSeed, nextSeed, iSeed, nextSeed);
}
auto const index = DataRead<I>(); auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex])); auto const scale = tcScale[tcIndex]; DataWriter dst; for (int i = 0; i != N; ++i) dst.Write(TCScale(Common::FromBigEndian(data[i]), scale)); LOG_TEX<N>(); ++tcIndex; } #if _M_SSE >= 0x401 static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L); template <typename I> void LOADERDECL TexCoord_ReadIndex_Short2_SSE4() { static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!"); // Heavy in ZWW auto const index = DataRead<I>(); const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); const __m128i a = _mm_cvtsi32_si128(*pData); const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2); const __m128i c = _mm_cvtepi16_epi32(b); const __m128 d = _mm_cvtepi32_ps(c); const __m128 e = _mm_load1_ps(&tcScale[tcIndex]); const __m128 f = _mm_mul_ps(d, e);
/// Construct a generator with the fixed built-in seed 23257132.
/// The packed seed register is filled, from the highest 32-bit lane to the
/// lowest, with (mSeed, mSeed + 1, mSeed, mSeed + 1).
Random::Random()
	: mSeed(23257132)
{
	const int nextSeed = mSeed + 1;
	mQuadSeed = _mm_set_epi32(mSeed, nextSeed, mSeed, nextSeed);
}
/*
 * Reset a hashing state: clear the whole structure, then load the eight
 * initial words into S->h[0..7].
 *
 * _mm_set_epi32 lists lanes from highest to lowest, so the last argument of
 * each call lands in the lowest-addressed word (h[0] = 0x6A09E667,
 * h[4] = 0x510E527F, assuming h is an array of 32-bit words). The aligned
 * stores require &S->h[0] to be 16-byte aligned.
 */
static void blake256_init( state *S )
{
  const __m128i initial_h0_3 = _mm_set_epi32(0xA54FF53A,0x3C6EF372,0xBB67AE85,0x6A09E667);
  const __m128i initial_h4_7 = _mm_set_epi32(0x5BE0CD19,0x1F83D9AB,0x9B05688C,0x510E527F);
  __m128i *chain = (__m128i*)(&S->h[0]);
  __m128i *chain_upper = (__m128i*)(&S->h[4]);

  memset(S, 0, sizeof(state));

  _mm_store_si128(chain, initial_h0_3);
  _mm_store_si128(chain_upper, initial_h4_7);
}
/*
 * Auction-style path search from node t back to node s on a graph stored in
 * compressed adjacency form.
 *
 *   pr        - node prices, updated in place.
 *   P         - per-node path position; INF marks a node that is not on the
 *               current path. Updated in place.
 *   ai0, ai1  - node index: ai0[i] is a node id and ai1[i] the first position
 *               of its arcs in a0/a1; the arcs end at ai1[i+1] (or `arcs` for
 *               the last entry).
 *   a0, a1    - arc arrays: a0[x] is the arc's other end node, a1[x] its cost.
 *   nodes     - number of entries in ai0/ai1.
 *   arcs      - number of entries in a0/a1.
 *   s, t      - the search runs until P[s] is no longer INF, starting at t.
 *
 * Returns 1 when check_s_t() rejects the input, 0 otherwise. The accumulated
 * path cost is printed when node s is reached.
 *
 * Each round picks, among the arcs leaving the current path end j whose target
 * is not yet on the path, the one maximising la = pr[a0[x]] - a1[x]
 * (four arcs per SSE iteration). If pr[j] > maxla, or no candidate exists,
 * the price of j drops to maxla and the path is contracted; otherwise the
 * path is extended along the best arc.
 */
int sse_auction_search(int *pr, int *P, int *ai0, int *ai1, int *a0, int *a1, int nodes, int arcs, int s, int t)
{
	int i = 0;
	int j = t;
	int k = 0;
	int m = 0;
	int lane = 0;
	int maxla = 0;
	int argmaxla = 0;
	int cost = 0;
	int length = 1;
	int path_cost = 0;
	int cost_tab[nodes+1];

	__m128i a0sse, a1sse, prsse, Psse, then;
	__m128i INFINITE, NEGINF, MAXLA, ARGMAXLA, LA, mask1, mask2, inrange, COST;

	for(i = 0; i <= nodes; i++)
	{
		cost_tab[i] = 0;
	}

	if(check_s_t(s, t, P, nodes) != 0)
	{
		return 1;
	}

	while(P[s] == INF)
	{
		/* Find the arc range [k, m) of node j. A scalar scan is sufficient and,
		 * unlike a 4-wide scan, never reads ai0/ai1 beyond `nodes` entries. */
		k = -1;
		m = -1;
		for(i = 0; i < nodes; i++)
		{
			if(ai0[i] == j)
			{
				k = ai1[i]; /* k - first arc leaving j */
				if(i < nodes - 1)
				{
					m = ai1[i+1];
				}
				else
				{
					m = arcs;
				}
			}
		}

		/* Accumulators are initialised on every round, so a node without arcs
		 * (k == -1) yields "no candidate" instead of stale or undefined lanes. */
		INFINITE = _mm_set1_epi32(INF);      /* "infinite" marker */
		NEGINF   = _mm_set1_epi32(0-INF);    /* -INF */
		COST     = _mm_set1_epi32(cost);     /* cost of the selected arc */
		MAXLA    = NEGINF;                   /* per-lane maximum of la */
		ARGMAXLA = _mm_set1_epi32(-1);       /* per-lane target node of that maximum */

		/* choose the best arc */
		if(k != -1)
		{
			for(i = k; i < m; i+=4)
			{
				int a0v[4], a1v[4], prv[4], Pv[4];

				/* Gather four arcs; lanes at or past m are filled with neutral
				 * values instead of reading beyond the arc range. */
				for(lane = 0; lane < 4; lane++)
				{
					if(i + lane < m)
					{
						a0v[lane] = a0[i+lane];
						a1v[lane] = a1[i+lane];
						prv[lane] = pr[a0v[lane]];
						Pv[lane]  = P[a0v[lane]];
					}
					else
					{
						a0v[lane] = -1;
						a1v[lane] = 0;
						prv[lane] = 0-INF;
						Pv[lane]  = 0;
					}
				}

				a1sse = _mm_set_epi32(a1v[0],a1v[1],a1v[2],a1v[3]);
				a0sse = _mm_set_epi32(a0v[0],a0v[1],a0v[2],a0v[3]);
				prsse = _mm_set_epi32(prv[0],prv[1],prv[2],prv[3]);
				Psse  = _mm_set_epi32(Pv[0],Pv[1],Pv[2],Pv[3]);

				/* lanes that belong to this node's arc range */
				inrange = _mm_cmpgt_epi32(_mm_set1_epi32(m),_mm_set_epi32(i,i+1,i+2,i+3));

				LA    = _mm_sub_epi32(prsse, a1sse);                     /* la = pr[a0[x]] - a1[x] */
				then  = _mm_max_epi32(LA,MAXLA);                         /* max(la, maxla) */
				/* target not on the path yet (P == INF) and a real arc */
				mask1 = _mm_and_si128(_mm_cmpeq_epi32(Psse,INFINITE), inrange);
				mask2 = _mm_and_si128(mask1,_mm_cmpgt_epi32(LA,MAXLA));  /* ... and la > maxla */
				MAXLA    = _mm_or_si128(_mm_and_si128(mask1,then),  _mm_andnot_si128(mask1,MAXLA));
				ARGMAXLA = _mm_or_si128(_mm_and_si128(mask2,a0sse), _mm_andnot_si128(mask2,ARGMAXLA));
				COST     = _mm_or_si128(_mm_and_si128(mask2,a1sse), _mm_andnot_si128(mask2,COST));
			}
		}

		/* reduce the four lanes to maxla, argmaxla, cost; the comparison must
		 * be signed, otherwise a non-negative la can never beat -INF */
		maxla = 0 - INF;
		for(lane = 0; lane < 4; lane++)
		{
			int la = (int) get_from_m128i(MAXLA,lane);
			if(la > maxla)
			{
				argmaxla = (int) get_from_m128i(ARGMAXLA,lane);
				maxla = la;
				cost = (int) get_from_m128i(COST,lane);
			}
		}

		/* path contraction */
		if(pr[j] > maxla || maxla == -INF)
		{
			/* update the price */
			pr[j] = maxla;
			/* a single-node path is not contracted */
			if(j != t)
			{
				/* update the path */
				P[j] = INF;
				length = length - 1;
				path_cost = path_cost - cost_tab[length];
				cost_tab[length] = 0;
				/* step back to the previous node on the path */
				for(i = 0; i < nodes; i++)
				{
					if(P[i] == length - 1)
					{
						j = i;
						break;
					}
				}
			}
		}
		/* path extension */
		else
		{
			P[argmaxla] = length;
			j = argmaxla;
			path_cost = path_cost + cost;
			cost_tab[length] = cost;
			length = length + 1;
			/* the path reached the start node => done */
			if(argmaxla == s)
			{
				printf("dlugosc sciezki: %d\n", path_cost);
				return 0;
			}
		}
	}
	return 0;
}
void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix, int maxres, ViterbiResult* result) #endif #endif { // Linear topology of query (and template) HMM: // 1. The HMM HMM has L+2 columns. Columns 1 to L contain // a match state, a delete state and an insert state each. // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY) // This column has only a match state and it has only a transitions to the next match state. // 3. The End state is M(L+1), the virtual match state in column i=L+1.(j=L+1) (Therefore X[k][L+1]=ANY) // Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0. // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments // (as long as the gap opening penalty d is higher than the best match score S(a,b)). // Pairwise alignment of two HMMs: // 1. Pair-states for the alignment of two HMMs are // MM (Q:Match T:Match) , GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delelte, T:Match) , MI (Q:Match T:Insert) // 2. Transitions are allowed only between the MM-state and each of the four other states. // Saving space: // The best score ending in pair state XY sXY[i][j] is calculated from left to right (j=1->t->L) // and top to bottom (i=1->q->L). To save space, only the last row of scores calculated is kept in memory. // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]). // When the calculation has proceeded up to the point where the scores for cell (i,j) are caculated, // sXY[i-1][j'] = sXY[j'] for j'>=j (A below) // sXY[i][j'] = sXY[j'] for j'<j (B below) // sXY[i-1][j-1]= sXY_i_1_j_1 (C below) // sXY[i][j] = sXY_i_j (D below) // j-1 // j // i-1: CAAAAAAAAAAAAAAAAAA // i : BBBBBBBBBBBBBD // Variable declarations const float smin = (this->local ? 
0 : -FLT_MAX); //used to distinguish between SW and NW algorithms in maximization const simd_float smin_vec = simdf32_set(smin); const simd_float shift_vec = simdf32_set(shift); // const simd_float one_vec = simdf32_set(1); // 00000001 const simd_int mm_vec = simdi32_set(2); //MM 00000010 const simd_int gd_vec = simdi32_set(3); //GD 00000011 const simd_int im_vec = simdi32_set(4); //IM 00000100 const simd_int dg_vec = simdi32_set(5); //DG 00000101 const simd_int mi_vec = simdi32_set(6); //MI 00000110 const simd_int gd_mm_vec = simdi32_set(8); // 00001000 const simd_int im_mm_vec = simdi32_set(16);// 00010000 const simd_int dg_mm_vec = simdi32_set(32);// 00100000 const simd_int mi_mm_vec = simdi32_set(64);// 01000000 #ifdef VITERBI_SS_SCORE HMM * q_s = q->GetHMM(0); const unsigned char * t_index; if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED ){ t_index = t->pred_index; }else if(ss_hmm_mode == HMM::PRED_DSSP){ t_index = t->dssp_index; } simd_float * ss_score_vec = (simd_float *) ss_score; #endif #ifdef AVX2 const simd_int shuffle_mask_extract = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); #endif #ifdef VITERBI_CELLOFF const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000 #ifdef AVX2 const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1); const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX); const simd_int shuffle_mask_celloff = _mm256_set_epi8( 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); #else // SSE case const simd_int co_vec = tmp_vec; const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX); #endif #endif // AVX2 end int i,j; //query and template match state indices simd_int i2_vec = simdi32_set(0); simd_int j2_vec = simdi32_set(0); simd_float sMM_i_j = 
simdf32_set(0); simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j; simd_float Si_vec; simd_float sMM_i_1_j_1; simd_float sMI_i_1_j_1; simd_float sIM_i_1_j_1; simd_float sGD_i_1_j_1; simd_float sDG_i_1_j_1; simd_float score_vec = simdf32_set(-FLT_MAX); simd_int byte_result_vec = simdi32_set(0); // Initialization of top row, i.e. cells (0,j) for (j=0; j <= t->L; ++j) { const unsigned int index_pos_j = j * 5; sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template); sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX); } // Viterbi algorithm const int queryLength = q->L; for (i=1; i <= queryLength; ++i) // Loop through query positions i { // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues // Initialize cells sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query); // initialize at (i-1,0) sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1) sMI_i_1_j_1 = simdf32_set(-FLT_MAX); sDG_i_1_j_1 = simdf32_set(-FLT_MAX); sGD_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i,jmin-1) const unsigned int index_pos_i = 0 * 5; sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query); // initialize at (i,0) sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX); #ifdef AVX2 unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i); #else unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i); #endif const unsigned int start_pos_tr_i_1 = (i - 1) * 7; const unsigned int start_pos_tr_i = (i) * 7; const simd_float q_m2m = simdf32_load((float *) (q->tr + 
start_pos_tr_i_1 + 2)); // M2M const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I // Find maximum score; global alignment: maxize only over last row and last column const bool findMaxInnerLoop = (local || i == queryLength); const int targetLength = t->L; #ifdef VITERBI_SS_SCORE if(ss_hmm_mode == HMM::NO_SS_INFORMATION){ // set all to log(1.0) = 0.0 for (j = 0; j <= (targetLength*VEC_SIZE); j++) // Loop through template positions j { ss_score[j] = 0.0; } }else { const float * score; if(ss_hmm_mode == HMM::PRED_PRED){ score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0]; }else if (ss_hmm_mode == HMM::DSSP_PRED){ score = &S73[ (int)q_s->ss_dssp[i]][0][0]; }else{ score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0]; } // access SS scores and write them to the ss_score array for (j = 0; j <= (targetLength*VEC_SIZE); j++) // Loop through template positions j { ss_score[j] = ssw * score[t_index[j]]; } } #endif for (j=1; j <= targetLength; ++j) // Loop through template positions j { simd_int index_vec; simd_int res_gt_vec; // cache line optimized reading const unsigned int start_pos_tr_j_1 = (j-1) * 7; const unsigned int start_pos_tr_j = (j) * 7; const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M const simd_float t_d2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D const simd_float 
t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j)); // I2i const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1)); // M2I // Find max value // CALCULATE_MAX6( sMM_i_j, // smin, // sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M], // sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M], // sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M], // sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M], // sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], // bMM[i][j] // ); // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M] simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m); // if mm > min { 2 } res_gt_vec = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec); byte_result_vec = simdi_and(res_gt_vec, mm_vec); sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec); // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M] simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m); // if gd > max { 3 } res_gt_vec = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j); index_vec = simdi_and( res_gt_vec, gd_vec); byte_result_vec = simdi_or( index_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec); // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M] simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m); // if im > max { 4 } MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec); // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M] simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m); // if dg > max { 5 } MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec); // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m); // if mi > max { 6 } MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, 
byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec); // TODO add secondary structure score // calculate amino acid profile-profile scores Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j])); #ifdef VITERBI_SS_SCORE Si_vec = simdf32_add(ss_score_vec[j], Si_vec); #endif Si_vec = simdf32_add(Si_vec, shift_vec); sMM_i_j = simdf32_add(sMM_i_j, Si_vec); //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 0: Sstruc[i][j]); const unsigned int index_pos_j = (j * 5); const unsigned int index_pos_j_1 = (j - 1) * 5; const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0)); const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3)); const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4)); const simd_float sMM_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); const simd_float sDG_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); const simd_float sMI_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3)); sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4)); // sGD_i_j = max2 // ( // sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query // sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query // bGD[i][j] // ); //sMM_DG_GD_MI_IM_vec simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query // if mm_gd > gd_dg { 8 } MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec); sGD_i_j = simdf32_max( mm_gd_vec, gd_gd_vec ); // sIM_i_j = max2 
// ( // sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] , // sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query // bIM[i][j] // ); simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m); simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query // if mm_mm > im_im { 16 } MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec); sIM_i_j = simdf32_max( mm_mm_vec, im_im_vec ); // sDG_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2D], // sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query // bDG[i][j] // ); simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d); simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query // if mm_dg > dg_dg { 32 } MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec); sDG_i_j = simdf32_max( mm_dg_vec , dg_dg_vec ); // sMI_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template // sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template // bMI[i][j] // ); simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i); // MM->MI gap opening M2I in template simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i); // MI->MI gap extension I2I in template // if mm_mi > mi_mi { 64 } MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec); sMI_i_j = simdf32_max( mm_mi_vec, mi_mi_vec ); // Cell of logic // if (cell_off[i][j]) //shift 10000000100000001000000010000000 -> 01000000010000000100000001000000 //because 10000000000000000000000000000000 = -2147483648 kills cmplt #ifdef VITERBI_CELLOFF #ifdef AVX2 // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x4040404040404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x4040404040404040 ) << std::endl; // } simd_int matrix_vec = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1); matrix_vec = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff); #else // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 
0x40404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040 ) << std::endl; // } simd_int matrix_vec = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1); #endif simd_int cell_off_vec = simdi_and(matrix_vec, co_vec); simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec ); // shift is because signed can't be checked here simd_float cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x4040404040404040) > 0){ // for(int i = 0; i < 8; i++){ // std::cout << i << " " << j << " " << ((float *) &cell_off_float_min_vec )[i] << " "; // } // std::cout << std::endl; // } sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec); // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec); sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec); sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec); sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec); #endif simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j); // write values back to ViterbiMatrix #ifdef AVX2 /* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */ /* abcdefgh 0000 0000 HGFE 0000 0000 0000 0000 DCBA */ const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract); /* abcd 0000 0000 0000 DCBA */ const __m128i abcd = _mm256_castsi256_si128(abcdefgh); /* efgh 0000 0000 HGFE 0000 */ const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1); _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh)); #else byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec); byte_result_vec = 
_mm_packus_epi16(byte_result_vec, byte_result_vec); int int_result = _mm_cvtsi128_si32(byte_result_vec); sCO_MI_DG_IM_GD_MM_vec[j] = int_result; #endif // Find maximum score; global alignment: maxize only over last row and last column // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; } if (findMaxInnerLoop){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec=simdf32_max(sMM_i_j,score_vec); } } //end for j // if global alignment: look for best cell in last column if (!local){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec = simdf32_max(sMM_i_j,score_vec); } // end for j } // end for i for(int seq_index=0; seq_index < maxres; seq_index++){ result->score[seq_index]=((float*)&score_vec)[seq_index]; result->i[seq_index] = ((int*)&i2_vec)[seq_index]; 
result->j[seq_index] = ((int*)&j2_vec)[seq_index]; } // printf("Template=%-12.12s i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score); }
/*
 * Produces the 64-element block mc_block from ref_block by running a fixed,
 * fully unrolled sequence of helper macros (LOADL, FILTERX, ADDL, ADDLRND,
 * STORB, STORSUM), all of which are defined outside this function.
 *
 *   mc_block   destination, 64 mlib_s16 values; the write pointer `sd` is
 *              advanced by 8 after each of the first seven stores
 *   ref_block  source bytes; the read pointer `sl` is advanced by `stride`
 *              after each of the first seven LOADL steps
 *   stride     distance between consecutive source rows, in bytes
 *
 * Always returns MLIB_SUCCESS.
 *
 * NOTE(review): the locals txmm0..7, t0..7 and the constants Czero/CF/C2/C4/C8
 * are not referenced directly in this body; presumably the macros use them
 * (the numeric macro argument most likely selects tN/txmmN by token pasting).
 * Do not rename or remove them without checking the macro definitions.
 */
mlib_status
__mlib_VideoP64Loop_S16_U8(
	mlib_s16 mc_block[64],
	const mlib_u8 *ref_block,
	mlib_s32 stride)
{
	const mlib_u8 *sl;
	mlib_s16 *sd;
	__m128i txmm0, txmm1, txmm2, txmm3, txmm4, txmm5, txmm6, txmm7;
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;
	__m128i Czero, CF, C2, C4, C8;

	/* Constants: zero, and the values 2, 4, 8 replicated in every 16-bit lane. */
	Czero = _mm_setzero_si128();
	C2 = _mm_set1_epi16(2);
	C4 = _mm_set1_epi16(4);
	C8 = _mm_set1_epi16(8);
	/*
	 * Viewed as eight 16-bit lanes, CF holds 0x00ff in lanes 0 and 7 and
	 * zero elsewhere (presumably selecting the two edge columns - confirm
	 * against FILTERX).
	 */
	CF = _mm_set_epi32(0xff0000, 0, 0, 0xff);

	sd = mc_block;
	sl = ref_block;

	/* Row 0: load and filter, stored on its own via STORB once row 1 is ready. */
	LOADL(0);
	sl += stride;
	FILTERX(0);

	LOADL(1);
	sl += stride;
	FILTERX(1);
	STORB(0);
	sd += 8;
	ADDL(0, 1);

	/*
	 * Rows 2..7: each step loads and filters one more row, combines it with
	 * the previous row (ADDL and ADDLRND alternate) and stores the output
	 * row that lies two rows behind the one just loaded.
	 */
	LOADL(2);
	sl += stride;
	FILTERX(2);
	ADDLRND(1, 2);
	STORSUM(0, 1);
	sd += 8;

	LOADL(3);
	sl += stride;
	FILTERX(3);
	ADDL(2, 3);
	STORSUM(1, 2);
	sd += 8;

	LOADL(4);
	sl += stride;
	FILTERX(4);
	ADDLRND(3, 4);
	STORSUM(2, 3);
	sd += 8;

	LOADL(5);
	sl += stride;
	FILTERX(5);
	ADDL(4, 5);
	STORSUM(3, 4);
	sd += 8;

	LOADL(6);
	sl += stride;
	FILTERX(6);
	ADDLRND(5, 6);
	STORSUM(4, 5);
	sd += 8;

	/* Last source row: no further advance of sl. */
	LOADL(7);
	FILTERX(7);
	ADDL(6, 7);
	STORSUM(5, 6);
	sd += 8;

	/* Final output row, stored with STORB like row 0. */
	STORB(7);

	return (MLIB_SUCCESS);
}
/*
 * Absorbs `databitlen` bits of `data` into the hash state.
 *
 * Each whole input byte is XORed into the lowest 32-bit lane of x[0] and is
 * followed by CUBEHASH_ROUNDS rounds of the permutation, carried out on local
 * copies of the eight state vectors. The copies are written back once all
 * whole bytes are consumed. A trailing partial byte (1..7 bits) is then XORed
 * straight into the state bytes at offset pos/8, and `pos` is advanced by the
 * number of leftover bits.
 *
 * Always returns SUCCESS.
 */
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
{
/* Rotate every 32-bit lane of v left by n bits (two shifts joined with XOR). */
#define UPDATE_ROTL32(v, n) \
  _mm_xor_si128(_mm_slli_epi32((v), (n)), _mm_srli_epi32((v), 32 - (n)))

  /* lo0..lo3 mirror x[0..3], hi0..hi3 mirror x[4..7]. */
  __m128i lo0 = state->x[0];
  __m128i lo1 = state->x[1];
  __m128i lo2 = state->x[2];
  __m128i lo3 = state->x[3];
  __m128i hi0 = state->x[4];
  __m128i hi1 = state->x[5];
  __m128i hi2 = state->x[6];
  __m128i hi3 = state->x[7];
  __m128i saved;
  int round;

  for (; databitlen >= 8; databitlen -= 8, data += 1) {
    /* Inject one message byte into the lowest lane. */
    lo0 = _mm_xor_si128(lo0, _mm_set_epi32(0, 0, 0, (int) (unsigned int) *data));

    for (round = 0; round < CUBEHASH_ROUNDS; ++round) {
      /* First half-round: add, rotate by 7 while swapping 0<->2 and 1<->3, xor. */
      hi0 = _mm_add_epi32(lo0, hi0);
      hi1 = _mm_add_epi32(lo1, hi1);
      hi2 = _mm_add_epi32(lo2, hi2);
      hi3 = _mm_add_epi32(lo3, hi3);

      saved = lo0;
      lo0 = _mm_xor_si128(UPDATE_ROTL32(lo2, 7), hi0);
      lo2 = _mm_xor_si128(UPDATE_ROTL32(saved, 7), hi2);
      saved = lo1;
      lo1 = _mm_xor_si128(UPDATE_ROTL32(lo3, 7), hi1);
      lo3 = _mm_xor_si128(UPDATE_ROTL32(saved, 7), hi3);

      hi0 = _mm_shuffle_epi32(hi0, 0x4e);
      hi1 = _mm_shuffle_epi32(hi1, 0x4e);
      hi2 = _mm_shuffle_epi32(hi2, 0x4e);
      hi3 = _mm_shuffle_epi32(hi3, 0x4e);

      /* Second half-round: add, rotate by 11 while swapping 0<->1 and 2<->3, xor. */
      hi0 = _mm_add_epi32(lo0, hi0);
      hi1 = _mm_add_epi32(lo1, hi1);
      hi2 = _mm_add_epi32(lo2, hi2);
      hi3 = _mm_add_epi32(lo3, hi3);

      saved = lo0;
      lo0 = _mm_xor_si128(UPDATE_ROTL32(lo1, 11), hi0);
      lo1 = _mm_xor_si128(UPDATE_ROTL32(saved, 11), hi1);
      saved = lo2;
      lo2 = _mm_xor_si128(UPDATE_ROTL32(lo3, 11), hi2);
      lo3 = _mm_xor_si128(UPDATE_ROTL32(saved, 11), hi3);

      hi0 = _mm_shuffle_epi32(hi0, 0xb1);
      hi1 = _mm_shuffle_epi32(hi1, 0xb1);
      hi2 = _mm_shuffle_epi32(hi2, 0xb1);
      hi3 = _mm_shuffle_epi32(hi3, 0xb1);
    }
  }

#undef UPDATE_ROTL32

  /* Publish the working copies before touching the state bytes directly. */
  state->x[0] = lo0;
  state->x[1] = lo1;
  state->x[2] = lo2;
  state->x[3] = lo3;
  state->x[4] = hi0;
  state->x[5] = hi1;
  state->x[6] = hi2;
  state->x[7] = hi3;

  /* Leftover bits of a final partial byte. */
  if (databitlen > 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    state->pos += databitlen;
  }

  return SUCCESS;
}
// Lane-wise select on floats: where `mask` is all-ones take `if_set`,
// elsewhere take `if_clear`. `mask` must come from a packed compare.
static inline __m128 minmax_select_ps(__m128 mask, __m128 if_set, __m128 if_clear)
{
	return _mm_or_ps(_mm_and_ps(mask, if_set), _mm_andnot_ps(mask, if_clear));
}

// Same select for 32-bit integer lanes, driven by a float compare mask.
static inline __m128i minmax_select_epi32(__m128 mask, __m128i if_set, __m128i if_clear)
{
	const __m128i m = _mm_castps_si128(mask);
	return _mm_or_si128(_mm_and_si128(m, if_set), _mm_andnot_si128(m, if_clear));
}

// Finds the minimum and maximum of buf[0..n) together with their indices.
//
//   n         number of floats in buf
//   buf       input values; must be 16-byte aligned (read with _mm_load_ps)
//   idx_min_  out: index of the minimum
//   idx_max_  out: index of the maximum
//   min_      out: minimum value
//   max_      out: maximum value
//
// The first n & ~3 elements are processed four at a time, the remainder by a
// scalar loop. When a value occurs several times the reported index is one of
// its positions, not necessarily the first. For n == 0 the outputs are
// min = FLT_MAX, max = -FLT_MAX and both indices 0.
//
// Only SSE2 instructions are used; the blends are and/andnot/or selects.
void minmax_vec2(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
	// Running per-lane extrema and the indices they were found at.
	__m128i sse_idx_min = _mm_setzero_si128();
	__m128i sse_idx_max = _mm_setzero_si128();
	__m128 sse_min = _mm_set1_ps(FLT_MAX);
	// The maximum must start at the most negative finite float. FLT_MIN is
	// the smallest *positive* float and would beat every negative input.
	__m128 sse_max = _mm_set1_ps(-FLT_MAX);

	// Number of elements covered by whole 4-float vectors.
	const uint32_t n_sse = n & ~3U;

	__m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i sse_4 = _mm_set1_epi32(4);
	for (uint32_t i = 0; i < n_sse; i += 4) {
		const __m128 sse_v = _mm_load_ps(&buf[i]);
		const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
		const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);
		sse_min = minmax_select_ps(sse_cmp_min, sse_v, sse_min);
		sse_max = minmax_select_ps(sse_cmp_max, sse_v, sse_max);
		sse_idx_min = minmax_select_epi32(sse_cmp_min, sse_idx, sse_idx_min);
		sse_idx_max = minmax_select_epi32(sse_cmp_max, sse_idx, sse_idx_max);
		sse_idx = _mm_add_epi32(sse_idx, sse_4);
	}

	// Horizontal reduction, step 1: compare lanes {0,1} against lanes {2,3}.
	__m128 sse_min_permute = _mm_shuffle_ps(sse_min, sse_min, _MM_SHUFFLE(0, 0, 3, 2));
	__m128 sse_max_permute = _mm_shuffle_ps(sse_max, sse_max, _MM_SHUFFLE(0, 0, 3, 2));
	__m128i sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, _MM_SHUFFLE(0, 0, 3, 2));
	__m128i sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, _MM_SHUFFLE(0, 0, 3, 2));

	__m128 sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
	__m128 sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
	sse_min = minmax_select_ps(sse_cmp_min, sse_min_permute, sse_min);
	sse_max = minmax_select_ps(sse_cmp_max, sse_max_permute, sse_max);
	sse_idx_min = minmax_select_epi32(sse_cmp_min, sse_idx_min_permute, sse_idx_min);
	sse_idx_max = minmax_select_epi32(sse_cmp_max, sse_idx_max_permute, sse_idx_max);

	// Step 2: compare lane 0 against lane 1; the result ends up in lane 0.
	sse_min_permute = _mm_shuffle_ps(sse_min, sse_min, _MM_SHUFFLE(0, 0, 0, 1));
	sse_max_permute = _mm_shuffle_ps(sse_max, sse_max, _MM_SHUFFLE(0, 0, 0, 1));
	sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, _MM_SHUFFLE(0, 0, 0, 1));
	sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, _MM_SHUFFLE(0, 0, 0, 1));

	sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
	sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
	sse_min = minmax_select_ps(sse_cmp_min, sse_min_permute, sse_min);
	sse_max = minmax_select_ps(sse_cmp_max, sse_max_permute, sse_max);
	sse_idx_min = minmax_select_epi32(sse_cmp_min, sse_idx_min_permute, sse_idx_min);
	sse_idx_max = minmax_select_epi32(sse_cmp_max, sse_idx_max_permute, sse_idx_max);

	// Epilogue: pull lane 0 out and finish the last (n % 4) elements.
	float min = _mm_cvtss_f32(sse_min);
	float max = _mm_cvtss_f32(sse_max);
	uint32_t idx_min = (uint32_t) _mm_cvtsi128_si32(sse_idx_min);
	uint32_t idx_max = (uint32_t) _mm_cvtsi128_si32(sse_idx_max);

	for (uint32_t i = n_sse; i < n; i++) {
		const float v = buf[i];
		if (v < min) {
			min = v;
			idx_min = i;
		}
		if (v > max) {
			max = v;
			idx_max = i;
		}
	}

	*idx_min_ = idx_min;
	*min_ = min;
	*idx_max_ = idx_max;
	*max_ = max;
}
/*
 * Writes the 18-byte trailer at the end of state->buffer: the hash size as an
 * unsigned short at offset uBlockLength-18, the processed bit count as a
 * DataLength at uBlockLength-16, and a zero DataLength at uBlockLength-8.
 *
 * The values are copied with memcpy rather than stored through casted
 * pointers, so the writes are valid whatever the alignment of the buffer and
 * do not alias it through an incompatible pointer type. The stored bytes (in
 * native byte order) are unchanged.
 */
static void final_write_trailer(hashState *state)
{
	const unsigned short hashSize = (unsigned short)state->uHashSize;
	const DataLength processedBits = state->processed_bits;
	const DataLength zero = 0;

	memcpy(state->buffer + state->uBlockLength - 18, &hashSize, sizeof(hashSize));
	memcpy(state->buffer + state->uBlockLength - 16, &processedBits, sizeof(processedBits));
	memcpy(state->buffer + state->uBlockLength - 8, &zero, sizeof(zero));
}

/*
 * Finishes the hash: pads the buffered data, compresses the final block(s)
 * and writes the digest to `hashval`.
 *
 *   state    hash state; its buffer, counters and chaining value are updated
 *   hashval  output; receives 32 bytes, or 64 bytes when uHashSize == 512
 *
 * Padding is a 0x80 byte, zero fill, then the 18-byte trailer. If the trailer
 * does not fit behind the buffered data, the current block is zero-filled and
 * compressed first and the trailer goes into an extra block.
 *
 * The counter state->k is set before each Compress call to the value for that
 * block minus const1536 (NOTE(review): presumably Compress adds that constant
 * back - confirm). For a block that carries no message bits the base is zero.
 *
 * Always returns SUCCESS.
 */
HashReturn Final(hashState *state, BitSequence *hashval)
{
	__m128i remainingbits;

	// Account for the bytes still sitting in the buffer. The bit count is
	// captured before the padding byte bumps uBufferBytes.
	state->processed_bits += state->uBufferBytes * 8;
	remainingbits = _mm_set_epi32(0, 0, 0, state->uBufferBytes * 8);

	// Pad with 0x80
	state->buffer[state->uBufferBytes++] = 0x80;

	// Enough buffer space for the trailer in this block?
	if((state->uBlockLength - state->uBufferBytes) >= 18)
	{
		// Zero the gap between the padding byte and the trailer
		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - (state->uBufferBytes + 18));

		// Hash size and processed bits
		final_write_trailer(state);

		// uBufferBytes == 1 means the buffer held no message bytes before
		// the padding byte was appended.
		if(state->uBufferBytes == 1)
		{
			state->k = _mm_setzero_si128();
			state->k = _mm_sub_epi64(state->k, state->const1536);
		}
		else
		{
			state->k = _mm_add_epi64(state->k, remainingbits);
			state->k = _mm_sub_epi64(state->k, state->const1536);
		}

		// Compress
		Compress(state, state->buffer, 1);
	}
	else
	{
		// Fill the rest of this block with zeros and compress it
		memset(state->buffer + state->uBufferBytes, 0, state->uBlockLength - state->uBufferBytes);
		state->k = _mm_add_epi64(state->k, remainingbits);
		state->k = _mm_sub_epi64(state->k, state->const1536);
		Compress(state, state->buffer, 1);

		// Extra block: zeros followed by the trailer
		memset(state->buffer, 0, state->uBlockLength - 18);
		final_write_trailer(state);

		// This block carries no message bits
		state->k = _mm_setzero_si128();
		state->k = _mm_sub_epi64(state->k, state->const1536);
		Compress(state, state->buffer, 1);
	}

	// Store the hash value
	_mm_storeu_si128((__m128i*)hashval + 0, state->state[0][0]);
	_mm_storeu_si128((__m128i*)hashval + 1, state->state[1][0]);

	if(state->uHashSize == 512)
	{
		_mm_storeu_si128((__m128i*)hashval + 2, state->state[2][0]);
		_mm_storeu_si128((__m128i*)hashval + 3, state->state[3][0]);
	}

	return SUCCESS;
}
int32_t similar = NEG_LIMIT; int32_t length = NEG_LIMIT; __m128i vNegLimit = _mm_set1_epi32(NEG_LIMIT); __m128i vPosLimit = _mm_set1_epi32(POS_LIMIT); __m128i vSaturationCheckMin = vPosLimit; __m128i vSaturationCheckMax = vNegLimit; __m128i vNegInf = _mm_set1_epi32(NEG_LIMIT); __m128i vOpen = _mm_set1_epi32(open); __m128i vGap = _mm_set1_epi32(gap); __m128i vZero = _mm_set1_epi32(0); __m128i vNegInf0 = _mm_insert_epi32_rpl(vZero, NEG_LIMIT, 3); __m128i vOne = _mm_set1_epi32(1); __m128i vN = _mm_set1_epi32(N); __m128i vGapN = _mm_set1_epi32(gap*N); __m128i vNegOne = _mm_set1_epi32(-1); __m128i vI = _mm_set_epi32(0,1,2,3); __m128i vJreset = _mm_set_epi32(0,-1,-2,-3); __m128i vMaxH = vNegInf; __m128i vMaxM = vNegInf; __m128i vMaxS = vNegInf; __m128i vMaxL = vNegInf; __m128i vILimit = _mm_set1_epi32(s1Len); __m128i vILimit1 = _mm_sub_epi32(vILimit, vOne); __m128i vJLimit = _mm_set1_epi32(s2Len); __m128i vJLimit1 = _mm_sub_epi32(vJLimit, vOne); __m128i vIBoundary = _mm_set_epi32( -open-0*gap, -open-1*gap, -open-2*gap, -open-3*gap);