// handle 1024 pixels (32x32, 16x64, 64x16) static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, unsigned int *const sse) { // extract the low lane and add it to the high lane const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); const __m128i vsum_64 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); }
// 駒割り以外の全計算 // pos.st->BKPP,WKPP,KPPを初期化する。Position::set()で一度だけ呼び出される。(以降は差分計算) // 手番側から見た評価値を返すので注意。(他の評価関数とは設計がこの点において異なる) Value compute_eval(const Position& pos) { Square sq_bk = pos.king_square(BLACK); Square sq_wk = pos.king_square(WHITE); const auto* ppkppb = kpp[sq_bk]; const auto* ppkppw = kpp[Inv(sq_wk)]; auto& pos_ = *const_cast<Position*>(&pos); auto list_fb = pos_.eval_list()->piece_list_fb(); auto list_fw = pos_.eval_list()->piece_list_fw(); int i, j; BonaPiece k0, k1,l0,l1; // 評価値の合計 EvalSum sum; // SSE2は少なくとも有るという前提で。 // sum.p[0](BKPP)とsum.p[1](WKPP)をゼロクリア sum.m[0] = _mm_setzero_si128(); // KK sum.p[2] = kk[sq_bk][sq_wk]; for (i = 0; i < PIECE_NO_KING; ++i) { k0 = list_fb[i]; k1 = list_fw[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (j = 0; j < i; ++j) { l0 = list_fb[j]; l1 = list_fw[j]; #if 0 sum.p[0] += pkppb[l0]; sum.p[1] += pkppw[l1]; #else // SSEによる実装 // pkppw[l1][0],pkppw[l1][1],pkppb[l0][0],pkppb[l0][1]の16bit変数4つを整数拡張で32bit化して足し合わせる __m128i tmp; tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0])); tmp = _mm_cvtepi16_epi32(tmp); sum.m[0] = _mm_add_epi32(sum.m[0], tmp); #endif } sum.p[2] += kkp[sq_bk][sq_wk][k0]; } auto& info = *pos.state(); info.sum = sum; sum.p[2][0] += pos.state()->materialValue * FV_SCALE; return Value(sum.sum(pos.side_to_move()) / FV_SCALE); }
// 評価関数が正しいかどうかを判定するのに使う Value Position::evaluate_correct(const Color us) const { int list0[PIECENUMBER_MAX + 1]; //駒番号numのlist0 int list1[PIECENUMBER_MAX + 1]; //駒番号numのlist1 int nlist = make_list_correct(list0, list1); const int sq_bk = SQ_BKING; const int sq_wk = SQ_WKING; const auto* ppkppb = Evaluater::KPP[sq_bk]; const auto* ppkppw = Evaluater::KPP[Inv(sq_wk)]; EvalSum score; score.p[2] = Evaluater::KK[sq_bk][sq_wk]; #if defined USE_AVX2_EVAL || defined USE_SSE_EVAL score.m[0] = _mm_setzero_si128(); for (int i = 0; i < nlist; ++i) { const int k0 = list0[i]; const int k1 = list1[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (int j = 0; j < i; ++j) { const int l0 = list0[j]; const int l1 = list1[j]; __m128i tmp; tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0])); tmp = _mm_cvtepi16_epi32(tmp); score.m[0] = _mm_add_epi32(score.m[0], tmp); } score.p[2] += Evaluater::KKP[sq_bk][sq_wk][k0]; } #else score.p[0][0] = 0; score.p[0][1] = 0; score.p[1][0] = 0; score.p[1][1] = 0; for (int i = 0; i < nlist; i++ ) { const int k0 = list0[i]; const int k1 = list1[i]; assert(0 <= k0 && k0 < fe_end); assert(0 <= k1 && k1 < fe_end); const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (int j = 0; j < i; j++ ) { const int l0 = list0[j]; const int l1 = list1[j]; assert(0 <= l0 && l0 < fe_end); assert(0 <= l1 && l1 < fe_end); score.p[0] += pkppb[l0]; score.p[1] += pkppw[l1]; } score.p[2] += Evaluater::KKP[sq_bk][sq_wk][k0]; } #endif score.p[2][0] += MATERIAL * FV_SCALE; return Value(score.sum(us) / FV_SCALE); }
// Computes the evaluation as the sum of the KK, KKP and KPP terms plus
// material, and returns Value(score.sum(us) / FV_SCALE).
// `ss` is accepted but not used in this function.
// NOTE(review): list0/list1 (and nlist in the scalar path) are not declared
// here; presumably they are members of Position - confirm.
Value Position::evaluate(const Color us, SearchStack* ss)
{
    const int black_king = SQ_BKING;
    const int white_king = SQ_WKING;
    assert(0 <= black_king && black_king < nsquare);
    assert(0 <= white_king && white_king < nsquare);

    const auto* black_kpp = Evaluater::KPP[black_king];
    const auto* white_kpp = Evaluater::KPP[Inv(white_king)];

    EvalSum score;
    score.p[2] = Evaluater::KK[black_king][white_king];

#if defined USE_AVX2_EVAL || defined USE_SSE_EVAL
    // SIMD path: iterates by piece number over
    // [PIECENUMBER_MIN, PIECENUMBER_MAX].
    score.m[0] = _mm_setzero_si128();

    for (int piece = PIECENUMBER_MIN; piece <= PIECENUMBER_MAX; ++piece) {
        const int k0 = list0[piece];
        const int k1 = list1[piece];
        const auto* black_row = black_kpp[k0];
        const auto* white_row = white_kpp[k1];

        for (int other = PIECENUMBER_MIN; other < piece; ++other) {
            const int l0 = list0[other];
            const int l1 = list1[other];

            // Pack the two 16-bit pairs into the low 64 bits, widen each
            // 16-bit value to 32 bits and add onto the accumulator.
            const int32_t black_pair = *reinterpret_cast<const int32_t*>(&black_row[l0][0]);
            const int32_t white_pair = *reinterpret_cast<const int32_t*>(&white_row[l1][0]);
            const __m128i packed = _mm_set_epi32(0, 0, white_pair, black_pair);
            score.m[0] = _mm_add_epi32(score.m[0], _mm_cvtepi16_epi32(packed));
        }

        score.p[2] += Evaluater::KKP[black_king][white_king][k0];
    }
#else
    // Scalar path: iterates over the first nlist list entries.
    score.p[0][0] = 0;
    score.p[0][1] = 0;
    score.p[1][0] = 0;
    score.p[1][1] = 0;

    for (int outer = 0; outer < nlist; ++outer) {
        const int k0 = list0[outer];
        const int k1 = list1[outer];
        assert(0 <= k0 && k0 < fe_end);
        assert(0 <= k1 && k1 < fe_end);
        const auto* black_row = black_kpp[k0];
        const auto* white_row = white_kpp[k1];

        for (int inner = 0; inner < outer; ++inner) {
            const int l0 = list0[inner];
            const int l1 = list1[inner];
            assert(0 <= l0 && l0 < fe_end);
            assert(0 <= l1 && l1 < fe_end);
            score.p[0] += black_row[l0];
            score.p[1] += white_row[l1];
        }

        score.p[2] += Evaluater::KKP[black_king][white_king][k0];
    }
#endif

    // Material is added to the first component of the third term.
    score.p[2][0] += MATERIAL * FV_SCALE;

    return Value(score.sum(us) / FV_SCALE);
}
// Loads four consecutive int16_t values from `src` and returns them
// sign-extended to four 32-bit integers in an __m128i.
inline __m128i load_aligned_int32(const int16_t* src)
{
    // 64-bit load: the four int16 values land in the low half of the register.
    const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
    // SSE4.1 provides the widening conversion directly.
    return _mm_cvtepi16_epi32(packed);
#else
    // Fallback: build an all-ones mask for each negative 16-bit element and
    // interleave it as the upper half of every 32-bit result (manual sign
    // extension).
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i sign_mask = _mm_cmplt_epi16(packed, zero);
    return _mm_unpacklo_epi16(packed, sign_mask);
#endif
}
// Reads an index of type I, fetches the two 16-bit texture-coordinate
// components it refers to, converts them to scaled floats and appends two
// floats to the vertex buffer. Also advances tcIndex.
void LOADERDECL TexCoord_ReadIndex_Short2_SSE4()
{
	static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");

	// Heavy in ZWW
	auto const index = DataRead<I>();

	// Locate the source element: array base plus index times stride.
	const auto array_slot = ARRAY_TEXCOORD0 + tcIndex;
	const s32 *pData = (const s32*)(cached_arraybases[array_slot] + (index * g_main_cp_state.array_strides[array_slot]));

	// Load 32 bits (both 16-bit components) into the low lane.
	const __m128i raw = _mm_cvtsi32_si128(*pData);
	// Byte shuffle with kMaskSwap16_2 (presumably a per-16-bit byte swap - confirm).
	const __m128i shuffled = _mm_shuffle_epi8(raw, kMaskSwap16_2);
	// Sign-extend 16-bit -> 32-bit, then convert to float.
	const __m128i widened = _mm_cvtepi16_epi32(shuffled);
	const __m128 as_float = _mm_cvtepi32_ps(widened);
	// Multiply every lane by the scale factor for this texture coordinate.
	const __m128 scale = _mm_load1_ps(&tcScale[tcIndex]);
	const __m128 scaled = _mm_mul_ps(as_float, scale);

	// The store writes four floats, but the write pointer advances by only
	// two, so only the first two are kept.
	_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, scaled);
	VertexManager::s_pCurBufferPointer += sizeof(float) * 2;

	LOG_TEX<2>();
	tcIndex++;
}
/* Checks _mm_cvtepi16_epi32: fills the low four shorts of every 128-bit
   source vector with alternating-sign squares, converts each vector, and
   aborts if any widened 32-bit result differs from its 16-bit source.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
      short s[NUM * 2];
    } dst, src;
  int n, slot;

  /* Element n goes to short slot (n % 4) of vector (n / 4); each vector
     holds eight shorts, of which only the low four are converted.  Even n
     gets +n*n, odd n gets -n*n.  */
  for (n = 0; n < NUM; n++)
    {
      slot = (n % 4) + (n / 4) * 8;
      src.s[slot] = n * n * ((n % 2) ? -1 : 1);
    }

  /* Convert one vector (four elements) at a time.  */
  for (n = 0; n < NUM; n += 4)
    dst.x[n / 4] = _mm_cvtepi16_epi32 (src.x[n / 4]);

  /* Every 32-bit result must equal the 16-bit value it came from.  */
  for (n = 0; n < NUM; n++)
    {
      slot = (n % 4) + (n / 4) * 8;
      if (src.s[slot] != dst.i[n])
	abort ();
    }
}
// 現在の局面の評価値の内訳を表示する。 void print_eval_stat(Position& pos) { cout << "--- EVAL STAT\n"; Square sq_bk = pos.king_square(BLACK); Square sq_wk = pos.king_square(WHITE); const auto* ppkppb = kpp[sq_bk]; const auto* ppkppw = kpp[Inv(sq_wk)]; auto& pos_ = *const_cast<Position*>(&pos); auto list_fb = pos_.eval_list()->piece_list_fb(); auto list_fw = pos_.eval_list()->piece_list_fw(); int i, j; BonaPiece k0, k1, l0, l1; // 38枚の駒を表示 for (i = 0; i < PIECE_NO_KING; ++i) cout << int(list_fb[i]) << " = " << list_fb[i] << " , " << int(list_fw[i]) << " = " << list_fw[i] << endl; // 評価値の合計 EvalSum sum; // SSE2は少なくとも有るという前提で。 // sum.p[0](BKPP)とsum.p[1](WKPP)をゼロクリア sum.m[0] = _mm_setzero_si128(); // KK sum.p[2] = kk[sq_bk][sq_wk]; cout << "KKC : " << sq_bk << " " << sq_wk << " = " << kk[sq_bk][sq_wk][0] << " + " << kk[sq_bk][sq_wk][1] << "\n"; for (i = 0; i < PIECE_NO_KING; ++i) { k0 = list_fb[i]; k1 = list_fw[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (j = 0; j < i; ++j) { l0 = list_fb[j]; l1 = list_fw[j]; #if 0 sum.p[0] += pkppb[l0]; sum.p[1] += pkppw[l1]; #else // SSEによる実装 // pkppw[l1][0],pkppw[l1][1],pkppb[l0][0],pkppb[l0][1]の16bit変数4つを整数拡張で32bit化して足し合わせる __m128i tmp; tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0])); tmp = _mm_cvtepi16_epi32(tmp); sum.m[0] = _mm_add_epi32(sum.m[0], tmp); cout << "BKPP : " << sq_bk << " " << k0 << " " << l0 << " = " << pkppb[l0][0] << " + " << pkppb[l0][1] << "\n"; cout << "WKPP : " << sq_wk << " " << k1 << " " << l1 << " = " << pkppw[l1][0] << " + " << pkppw[l1][1] << "\n"; #endif } sum.p[2] += kkp[sq_bk][sq_wk][k0]; cout << "KKP : " << sq_bk << " " << sq_wk << " " << k0 << " = " << kkp[sq_bk][sq_wk][k0][0] << " + " << kkp[sq_bk][sq_wk][k0][1] << "\n"; } cout << "Material = " << pos.state()->materialValue << endl; cout << sum; cout << "---\n"; }
// Codegen test wrapper around _mm_cvtepi16_epi32. The directive comments
// inside the body look like FileCheck-style patterns matched against the
// compiler output (here: a pmovsxwd intrinsic call and instruction), so
// they must be kept verbatim.
__m128i test_mm_cvtepi16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi16_epi32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> {{.*}})
  // CHECK-ASM: pmovsxwd %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepi16_epi32(a);
}
// Codegen test wrapper around _mm_cvtepi16_epi32. The directive comments
// inside the body look like FileCheck-style patterns matched against the
// compiler output (here: a shufflevector selecting the low four 16-bit
// elements followed by a sext to 32 bits), so they must be kept verbatim.
__m128i test_mm_cvtepi16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi16_epi32
  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
  return _mm_cvtepi16_epi32(a);
}
int Position::evaluate(const Color us) const { int list0[NLIST], list1[NLIST]; int sq_bk, sq_wk; #ifndef TWIG int score; #else EvalSum score; #endif static int count=0; count++; int nlist=0; // 持ち駒をリスト化する #define FOO(hand, Piece, list0_index, list1_index) \ for (int i = I2Hand##Piece(hand); i >= 1; --i) { \ list0[nlist] = list0_index + i; \ list1[nlist] = list1_index + i; \ ++nlist; \ } FOO(HAND_B, Pawn , f_hand_pawn , e_hand_pawn ) FOO(HAND_W, Pawn , e_hand_pawn , f_hand_pawn ) FOO(HAND_B, Lance , f_hand_lance , e_hand_lance ) FOO(HAND_W, Lance , e_hand_lance , f_hand_lance ) FOO(HAND_B, Knight, f_hand_knight, e_hand_knight) FOO(HAND_W, Knight, e_hand_knight, f_hand_knight) FOO(HAND_B, Silver, f_hand_silver, e_hand_silver) FOO(HAND_W, Silver, e_hand_silver, f_hand_silver) FOO(HAND_B, Gold , f_hand_gold , e_hand_gold ) FOO(HAND_W, Gold , e_hand_gold , f_hand_gold ) FOO(HAND_B, Bishop, f_hand_bishop, e_hand_bishop) FOO(HAND_W, Bishop, e_hand_bishop, f_hand_bishop) FOO(HAND_B, Rook , f_hand_rook , e_hand_rook ) FOO(HAND_W, Rook , e_hand_rook , f_hand_rook ) #undef FOO nlist = make_list_apery(list0, list1, nlist); sq_bk = SQ_BKING; sq_wk = SQ_WKING; assert(0 <= sq_bk && sq_bk < nsquare); assert(0 <= sq_wk && sq_wk < nsquare); const auto* ppkppb = KPP[sq_bk ]; const auto* ppkppw = KPP[Inv(sq_wk)]; #ifndef TWIG score = fv_kk[sq_bk][sq_wk]; for (int i = 0; i < nlist; i++ ) { const int k0 = list0[i]; const int k1 = list1[i]; assert(0 <= k0 && k0 < fe_end); assert(0 <= k1 && k1 < fe_end); const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (int j = 0; j < i; j++ ) { const int l0 = list0[j]; const int l1 = list1[j]; assert(0 <= l0 && l0 < fe_end); assert(0 <= l1 && l1 < fe_end); score += pkppb[l0]; score -= pkppw[l1]; } score += fv_kkp[sq_bk][sq_wk][k0]; } score += MATERIAL * FV_SCALE; score /= FV_SCALE; score = (us == BLACK) ? 
score : -score; return score; #else EvalSum sum; sum.p[2] = KK[sq_bk][sq_wk]; #if defined USE_AVX2_EVAL || defined USE_SSE_EVAL sum.m[0] = _mm_setzero_si128(); for (int i = 0; i < nlist; ++i) { const int k0 = list0[i]; const int k1 = list1[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (int j = 0; j < i; ++j) { const int l0 = list0[j]; const int l1 = list1[j]; __m128i tmp; tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0])); tmp = _mm_cvtepi16_epi32(tmp); sum.m[0] = _mm_add_epi32(sum.m[0], tmp); } sum.p[2] += KKP[sq_bk][sq_wk][k0]; } sum.p[2][0] += MATERIAL * FV_SCALE; #else // loop 開始を i = 1 からにして、i = 0 の分のKKPを先に足す。 sum.p[2] += KKP[sq_bk][sq_wk][list0[0]]; sum.p[0][0] = 0; sum.p[0][1] = 0; sum.p[1][0] = 0; sum.p[1][1] = 0; for (int i = 1; i < nlist; ++i) { const int k0 = list0[i]; const int k1 = list1[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (int j = 0; j < i; ++j) { const int l0 = list0[j]; const int l1 = list1[j]; sum.p[0] += pkppb[l0]; sum.p[1] += pkppw[l1]; } sum.p[2] += KKP[sq_bk][sq_wk][k0]; } sum.p[2][0] += MATERIAL * FV_SCALE; #endif #ifdef _DEBUG score.p[2] = KK[sq_bk][sq_wk]; score.p[0][0] = 0; score.p[0][1] = 0; score.p[1][0] = 0; score.p[1][1] = 0; for (int i = 0; i < nlist; ++i) { const int k0 = list0[i]; const int k1 = list1[i]; const auto* pkppb = ppkppb[k0]; const auto* pkppw = ppkppw[k1]; for (int j = 0; j < i; ++j) { const int l0 = list0[j]; const int l1 = list1[j]; score.p[0][0] += pkppb[l0][0]; score.p[0][1] += pkppb[l0][1]; score.p[1][0] += pkppw[l1][0]; score.p[1][1] += pkppw[l1][1]; } score.p[2][0] += KKP[sq_bk][sq_wk][k0][0]; score.p[2][1] += KKP[sq_bk][sq_wk][k0][1]; } score.p[2][0] += MATERIAL * FV_SCALE; assert(score.sum(us) == sum.sum(us)); #endif return sum.sum(us) / FV_SCALE ; #endif }