C++ (Cpp) _mm_cvtepi16_epi32 예제들

예제 #1

0

파일 보기

파일: variance_avx2.c 프로젝트: jfiguinha/Regards

// Final variance step for blocks of 1024 pixels (32x32, 16x64, 64x16).
static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
                                           unsigned int *const sse) {
  // Combine the low and high 128-bit lanes of the 16-bit sums.
  const __m128i vsum_16 = mm256_add_hi_lo_epi16(vsum);
  // Sign-extend the lower four and the upper four 16-bit sums to 32 bits.
  const __m128i vsum_lo_32 = _mm_cvtepi16_epi32(vsum_16);
  const __m128i vsum_hi_32 = _mm_cvtepi16_epi32(_mm_srli_si128(vsum_16, 8));
  // Add both halves so four 32-bit partial sums remain.
  const __m128i vsum_32 = _mm_add_epi32(vsum_lo_32, vsum_hi_32);
  return variance_final_from_32bit_sum_avx2(vsse, vsum_32, sse);
}

예제 #2

0

파일 보기

파일: evaluate_kppt.cpp 프로젝트: bottlenome/YaneuraOu

  // Computes every evaluation term except the material balance.
  // Initializes BKPP, WKPP and KPP in pos.st. According to the original note
  // this is called only once, from Position::set(); later updates are
  // incremental.
  // Note that the returned value is from the side-to-move's point of view,
  // which differs from the design of the other evaluation functions.
  Value compute_eval(const Position& pos)
  {
    Square sq_bk = pos.king_square(BLACK);
    Square sq_wk = pos.king_square(WHITE);
    const auto* kpp_black = kpp[sq_bk];
    const auto* kpp_white = kpp[Inv(sq_wk)];

    // The piece lists are fetched through a non-const view of the position.
    Position& mutable_pos = const_cast<Position&>(pos);
    auto list_fb = mutable_pos.eval_list()->piece_list_fb();
    auto list_fw = mutable_pos.eval_list()->piece_list_fw();

    // Running total of the evaluation terms.
    EvalSum sum;

    // Zero sum.p[0] (BKPP) and sum.p[1] (WKPP) at once.
    // (The original author assumes at least SSE2 is available.)
    sum.m[0] = _mm_setzero_si128();

    // KK
    sum.p[2] = kk[sq_bk][sq_wk];

    for (int i = 0; i < PIECE_NO_KING; ++i)
    {
      BonaPiece k0 = list_fb[i];
      BonaPiece k1 = list_fw[i];
      const auto* row_black = kpp_black[k0];
      const auto* row_white = kpp_white[k1];

      for (int j = 0; j < i; ++j)
      {
        BonaPiece l0 = list_fb[j];
        BonaPiece l1 = list_fw[j];

        // SIMD form of:
        //   sum.p[0] += row_black[l0];
        //   sum.p[1] += row_white[l1];
        // The four 16-bit values row_white[l1][0], row_white[l1][1],
        // row_black[l0][0], row_black[l0][1] are sign-extended to 32 bits
        // and accumulated together.
        const int32_t pair_black = *reinterpret_cast<const int32_t*>(&row_black[l0][0]);
        const int32_t pair_white = *reinterpret_cast<const int32_t*>(&row_white[l1][0]);
        const __m128i packed = _mm_set_epi32(0, 0, pair_white, pair_black);
        sum.m[0] = _mm_add_epi32(sum.m[0], _mm_cvtepi16_epi32(packed));
      }

      sum.p[2] += kkp[sq_bk][sq_wk][k0];
    }

    // Store the sum in the position state before the material term is added.
    auto& info = *pos.state();
    info.sum = sum;

    sum.p[2][0] += pos.state()->materialValue * FV_SCALE;

    return Value(sum.sum(pos.side_to_move()) / FV_SCALE);
  }

예제 #3

0

파일 보기

파일: evaluate.cpp 프로젝트: Jangja/saya_chan

// Reference evaluation, used to judge whether the evaluation function is correct.
Value Position::evaluate_correct(const Color us) const
{
    int list0[PIECENUMBER_MAX + 1]; // list0 for piece number num
    int list1[PIECENUMBER_MAX + 1]; // list1 for piece number num
    const int nlist = make_list_correct(list0, list1);

    const int sq_bk = SQ_BKING;
    const int sq_wk = SQ_WKING;
    const auto* kpp_black = Evaluater::KPP[sq_bk];
    const auto* kpp_white = Evaluater::KPP[Inv(sq_wk)];

    EvalSum score;
    score.p[2] = Evaluater::KK[sq_bk][sq_wk];
#if defined USE_AVX2_EVAL || defined USE_SSE_EVAL
    // SIMD path: p[0] and p[1] are cleared and accumulated through m[0].
    score.m[0] = _mm_setzero_si128();
    for (int i = 0; i < nlist; ++i) {
        const int k0 = list0[i];
        const int k1 = list1[i];
        const auto* row_black = kpp_black[k0];
        const auto* row_white = kpp_white[k1];
        for (int j = 0; j < i; ++j) {
            const int l0 = list0[j];
            const int l1 = list1[j];
            // Read each 16-bit pair as one 32-bit word, then sign-extend the
            // four 16-bit values to 32 bits and add them to the running sum.
            const int32_t pair_black = *reinterpret_cast<const int32_t*>(&row_black[l0][0]);
            const int32_t pair_white = *reinterpret_cast<const int32_t*>(&row_white[l1][0]);
            const __m128i packed = _mm_set_epi32(0, 0, pair_white, pair_black);
            score.m[0] = _mm_add_epi32(score.m[0], _mm_cvtepi16_epi32(packed));
        }
        score.p[2] += Evaluater::KKP[sq_bk][sq_wk][k0];
    }
#else
    // Scalar path.
    score.p[0][0] = score.p[0][1] = 0;
    score.p[1][0] = score.p[1][1] = 0;
    for (int i = 0; i < nlist; ++i) {
        const int k0 = list0[i];
        const int k1 = list1[i];
        assert(0 <= k0 && k0 < fe_end);
        assert(0 <= k1 && k1 < fe_end);
        const auto* row_black = kpp_black[k0];
        const auto* row_white = kpp_white[k1];
        for (int j = 0; j < i; ++j) {
            const int l0 = list0[j];
            const int l1 = list1[j];
            assert(0 <= l0 && l0 < fe_end);
            assert(0 <= l1 && l1 < fe_end);
            score.p[0] += row_black[l0];
            score.p[1] += row_white[l1];
        }
        score.p[2] += Evaluater::KKP[sq_bk][sq_wk][k0];
    }
#endif
    score.p[2][0] += MATERIAL * FV_SCALE;
    return Value(score.sum(us) / FV_SCALE);
}

예제 #4

0

파일 보기

파일: evaluate.cpp 프로젝트: Jangja/saya_chan

// Evaluates the position and returns the score for side `us`.
//
// The KK term is taken for the two king squares; then, for every piece number
// in [PIECENUMBER_MIN, PIECENUMBER_MAX], the KPP terms against all lower piece
// numbers and the KKP term are accumulated from list0/list1. Finally the
// material term is added and the sum is scaled down by FV_SCALE.
//
// `ss` is not used by the code in this function.
//
// NOTE(review): the scalar fallback used to loop over `0 .. nlist-1`, but
// `nlist` is not declared in this function and that range disagrees with the
// SIMD path. Both paths now walk the same piece-number range so that the two
// build configurations compute the same sum. Confirm against the class
// definition that no `nlist` member was intended here.
Value Position::evaluate(const Color us, SearchStack* ss)
{
    const int sq_bk = SQ_BKING;
    const int sq_wk = SQ_WKING;
    assert(0 <= sq_bk && sq_bk < nsquare);
    assert(0 <= sq_wk && sq_wk < nsquare);
    const auto* ppkppb = Evaluater::KPP[sq_bk     ];
    const auto* ppkppw = Evaluater::KPP[Inv(sq_wk)];

    EvalSum score;
    score.p[2] = Evaluater::KK[sq_bk][sq_wk];
#if defined USE_AVX2_EVAL || defined USE_SSE_EVAL
    // SIMD path: p[0] and p[1] are cleared and accumulated through m[0].
    score.m[0] = _mm_setzero_si128();
    for (int kn = PIECENUMBER_MIN; kn <= PIECENUMBER_MAX; kn++) {
        const int k0 = list0[kn];
        const int k1 = list1[kn];
        const auto* pkppb = ppkppb[k0];
        const auto* pkppw = ppkppw[k1];
        for (int j = PIECENUMBER_MIN; j < kn; j++) {
            const int l0 = list0[j];
            const int l1 = list1[j];
            // Read each 16-bit pair as one 32-bit word, sign-extend the four
            // 16-bit values to 32 bits and add them to the running sum.
            __m128i tmp;
            tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0]));
            tmp = _mm_cvtepi16_epi32(tmp);
            score.m[0] = _mm_add_epi32(score.m[0], tmp);
        }
        score.p[2] += Evaluater::KKP[sq_bk][sq_wk][k0];
    }
#else
    // Scalar path: same iteration space as the SIMD path above.
    score.p[0][0] = 0;
    score.p[0][1] = 0;
    score.p[1][0] = 0;
    score.p[1][1] = 0;
    for (int kn = PIECENUMBER_MIN; kn <= PIECENUMBER_MAX; kn++) {
        const int k0 = list0[kn];
        const int k1 = list1[kn];
        assert(0 <= k0 && k0 < fe_end);
        assert(0 <= k1 && k1 < fe_end);
        const auto* pkppb = ppkppb[k0];
        const auto* pkppw = ppkppw[k1];
        for (int j = PIECENUMBER_MIN; j < kn; j++) {
            const int l0 = list0[j];
            const int l1 = list1[j];
            assert(0 <= l0 && l0 < fe_end);
            assert(0 <= l1 && l1 < fe_end);
            score.p[0] += pkppb[l0];
            score.p[1] += pkppw[l1];
        }
        score.p[2] += Evaluater::KKP[sq_bk][sq_wk][k0];
    }
#endif
    score.p[2][0] += MATERIAL * FV_SCALE;
    return Value(score.sum(us) / FV_SCALE);
}

예제 #5

0

파일 보기

파일: xsimd_sse_int32.hpp 프로젝트: jmabille/nxsimd

        // Loads four int16_t values from src and widens them, with sign
        // extension, into the four 32-bit lanes of the result.
        inline __m128i load_aligned_int32(const int16_t* src)
        {
            // Fetch 64 bits (four 16-bit values) into the low half of a register.
            const __m128i packed = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src));
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
            return _mm_cvtepi16_epi32(packed);
#else
            // Without SSE4.1: build an all-ones mask for negative lanes and
            // interleave it as the upper 16 bits of every 32-bit result lane.
            const __m128i sign_bits = _mm_cmpgt_epi16(_mm_setzero_si128(), packed);
            return _mm_unpacklo_epi16(packed, sign_bits);
#endif
        }

예제 #6

0

파일 보기

파일: VertexLoader_TextCoord.cpp 프로젝트: Asmodean-/dolphin

// Reads an index, fetches the two 16-bit texture-coordinate components it
// refers to, converts them to floats scaled by tcScale[tcIndex] and appends
// the two floats to the vertex buffer.
// NOTE(review): `I` is used as a type but is not declared in this excerpt;
// presumably a template parameter list precedes the function — confirm.
void LOADERDECL TexCoord_ReadIndex_Short2_SSE4()
{
	static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");

	// Heavy in ZWW
	auto const index = DataRead<I>();
	const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex]));
	// Load both 16-bit components (32 bits) into the low lane.
	const __m128i a = _mm_cvtsi32_si128(*pData);
	// Byte shuffle by kMaskSwap16_2 (presumably a byte swap of the two
	// 16-bit values — confirm against the mask definition).
	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2);
	// Sign-extend to 32 bits, convert to float and apply the scale factor.
	const __m128i c = _mm_cvtepi16_epi32(b);
	const __m128 d = _mm_cvtepi32_ps(c);
	const __m128 e = _mm_load1_ps(&tcScale[tcIndex]);
	const __m128 f = _mm_mul_ps(d, e);
	// Only two floats are produced and the write pointer advances by two
	// floats, so store just the low 64 bits. A full 128-bit store would write
	// 8 bytes beyond the data this function owns.
	_mm_storel_pi(reinterpret_cast<__m64*>(VertexManager::s_pCurBufferPointer), f);
	VertexManager::s_pCurBufferPointer += sizeof(float) * 2;
	LOG_TEX<2>();
	tcIndex++;
}

예제 #7

0

파일 보기

파일: sse4_1-pmovsxwd.c 프로젝트: IntegerCompany/linaro-android-gcc

/* Checks _mm_cvtepi16_epi32: fills the low four shorts of every source
   vector, widens each vector, and aborts if any widened value differs
   from the short it came from.  */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM / 4];
      int i[NUM];
      short s[NUM * 2];
    } dst, src;
  int k;
  int sign = 1;

  /* Value k goes to short slot (k % 4) of vector (k / 4); the sign
     alternates from one value to the next.  */
  for (k = 0; k < NUM; k++, sign = -sign)
    src.s[(k % 4) + (k / 4) * 8] = k * k * sign;

  /* Widen one vector (four values) at a time.  */
  for (k = 0; k * 4 < NUM; k++)
    dst.x [k] = _mm_cvtepi16_epi32 (src.x [k]);

  for (k = 0; k < NUM; k++)
    {
      const int slot = (k % 4) + (k / 4) * 8;

      if (src.s[slot] != dst.i[k])
	abort ();
    }
}

예제 #8

0

파일 보기

파일: evaluate_kppt.cpp 프로젝트: bottlenome/YaneuraOu

  // Prints a breakdown of the evaluation value of the current position.
  void print_eval_stat(Position& pos)
  {
    cout << "--- EVAL STAT\n";

    Square sq_bk = pos.king_square(BLACK);
    Square sq_wk = pos.king_square(WHITE);
    const auto* kpp_black = kpp[sq_bk];
    const auto* kpp_white = kpp[Inv(sq_wk)];

    auto list_fb = pos.eval_list()->piece_list_fb();
    auto list_fw = pos.eval_list()->piece_list_fw();

    // Show the entries of both piece lists (38 pieces per the original note).
    for (int n = 0; n < PIECE_NO_KING; ++n)
    {
      cout << int(list_fb[n]) << " = " << list_fb[n] << " , " << int(list_fw[n]) << " =  " << list_fw[n] << endl;
    }

    // Running total of the evaluation terms.
    EvalSum sum;

    // Zero sum.p[0] (BKPP) and sum.p[1] (WKPP) at once.
    // (The original author assumes at least SSE2 is available.)
    sum.m[0] = _mm_setzero_si128();

    // KK
    const auto& kk_entry = kk[sq_bk][sq_wk];
    sum.p[2] = kk_entry;
    cout << "KKC : " << sq_bk << " " << sq_wk << " = " << kk_entry[0] << " + " << kk_entry[1] << "\n";

    for (int i = 0; i < PIECE_NO_KING; ++i)
    {
      BonaPiece k0 = list_fb[i];
      BonaPiece k1 = list_fw[i];
      const auto* row_black = kpp_black[k0];
      const auto* row_white = kpp_white[k1];

      for (int j = 0; j < i; ++j)
      {
        BonaPiece l0 = list_fb[j];
        BonaPiece l1 = list_fw[j];

        // SIMD form of:
        //   sum.p[0] += row_black[l0];
        //   sum.p[1] += row_white[l1];
        // The four 16-bit values row_white[l1][0], row_white[l1][1],
        // row_black[l0][0], row_black[l0][1] are sign-extended to 32 bits
        // and accumulated together.
        const int32_t pair_black = *reinterpret_cast<const int32_t*>(&row_black[l0][0]);
        const int32_t pair_white = *reinterpret_cast<const int32_t*>(&row_white[l1][0]);
        const __m128i packed = _mm_set_epi32(0, 0, pair_white, pair_black);
        sum.m[0] = _mm_add_epi32(sum.m[0], _mm_cvtepi16_epi32(packed));

        cout << "BKPP : " << sq_bk << " " << k0 << " " << l0 << " = " << row_black[l0][0] << " + " << row_black[l0][1] << "\n";
        cout << "WKPP : " << sq_wk << " " << k1 << " " << l1 << " = " << row_white[l1][0] << " + " << row_white[l1][1] << "\n";
      }

      const auto& kkp_entry = kkp[sq_bk][sq_wk][k0];
      sum.p[2] += kkp_entry;

      cout << "KKP : " << sq_bk << " " << sq_wk << " " << k0 << " = " << kkp_entry[0] << " + " << kkp_entry[1] << "\n";
    }

    cout << "Material = " << pos.state()->materialValue << endl;
    cout << sum;
    cout << "---\n";
  }

예제 #9

0

파일 보기

파일: sse41-builtins.c 프로젝트: ashwinma/clang_trunk

// Wraps a single _mm_cvtepi16_epi32 call so the code generated for it can be
// inspected. The comment lines inside the body are presumably FileCheck
// directives matched against the compiler output, so keep their text exactly
// as written.
__m128i test_mm_cvtepi16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi16_epi32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> {{.*}})
  // CHECK-ASM: pmovsxwd %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepi16_epi32(a);
}

예제 #10

0

파일 보기

파일: sse41-builtins.c 프로젝트: JaredCJR/clang

// Wraps a single _mm_cvtepi16_epi32 call so the code generated for it can be
// inspected. The comment lines inside the body are presumably FileCheck
// directives matched against the compiler output (here a shufflevector of the
// low four elements followed by a sign extension), so keep their text exactly
// as written.
__m128i test_mm_cvtepi16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi16_epi32
  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
  return _mm_cvtepi16_epi32(a);
}

예제 #11

0

파일 보기

파일: evaluate_apery.cpp 프로젝트: YasuhiroIke/Usapyon2-Slave

// Evaluates the position and returns the score for side `us`.
//
// list0/list1 are first filled with entries for the pieces in hand (FOO
// macro below) and then completed by make_list_apery(). Two builds exist:
//  - without TWIG: a plain int score is accumulated (KK + KPP(black) -
//    KPP(white) + KKP + material), divided by FV_SCALE and negated when
//    `us` is not BLACK;
//  - with TWIG: the terms are accumulated in an EvalSum, either with SSE
//    intrinsics or with a scalar loop, and sum.sum(us) / FV_SCALE is returned.
//    In _DEBUG builds the sum is recomputed element by element and compared.
int Position::evaluate(const Color us) const
{
	int list0[NLIST], list1[NLIST];
	int sq_bk, sq_wk;
#ifndef TWIG
	int score;
#else
	// Only used by the _DEBUG cross-check at the end of the function.
	EvalSum score;
#endif
	// Call counter; it is only incremented within this function.
	static int count=0;
	count++;
	int nlist=0;

	// Turn the pieces in hand into list entries.
	// For i counting down from I2Hand<Piece>(hand) to 1, appends
	// list0_index + i to list0 and list1_index + i to list1.
#define FOO(hand, Piece, list0_index, list1_index)    \
	for (int i = I2Hand##Piece(hand); i >= 1; --i) {  \
		list0[nlist] = list0_index + i;               \
		list1[nlist] = list1_index + i;               \
		++nlist; \
	}

	FOO(HAND_B, Pawn  , f_hand_pawn  , e_hand_pawn  )
	FOO(HAND_W, Pawn  , e_hand_pawn  , f_hand_pawn  )
	FOO(HAND_B, Lance , f_hand_lance , e_hand_lance )
	FOO(HAND_W, Lance , e_hand_lance , f_hand_lance )
	FOO(HAND_B, Knight, f_hand_knight, e_hand_knight)
	FOO(HAND_W, Knight, e_hand_knight, f_hand_knight)
	FOO(HAND_B, Silver, f_hand_silver, e_hand_silver)
	FOO(HAND_W, Silver, e_hand_silver, f_hand_silver)
	FOO(HAND_B, Gold  , f_hand_gold  , e_hand_gold  )
	FOO(HAND_W, Gold  , e_hand_gold  , f_hand_gold  )
	FOO(HAND_B, Bishop, f_hand_bishop, e_hand_bishop)
	FOO(HAND_W, Bishop, e_hand_bishop, f_hand_bishop)
	FOO(HAND_B, Rook  , f_hand_rook  , e_hand_rook  )
	FOO(HAND_W, Rook  , e_hand_rook  , f_hand_rook  )
#undef FOO

	// Continue filling the lists after the hand entries; the return value
	// becomes the new entry count.
	nlist = make_list_apery(list0, list1, nlist);

	sq_bk = SQ_BKING;
	sq_wk = SQ_WKING;
	assert(0 <= sq_bk && sq_bk < nsquare);
	assert(0 <= sq_wk && sq_wk < nsquare);
	const auto* ppkppb = KPP[sq_bk     ];
	const auto* ppkppw = KPP[Inv(sq_wk)];

#ifndef TWIG
	// Plain integer evaluation.
	score = fv_kk[sq_bk][sq_wk];
	for (int i = 0; i < nlist; i++ ) {
		const int k0 = list0[i];
		const int k1 = list1[i];
		assert(0 <= k0 && k0 < fe_end);
		assert(0 <= k1 && k1 < fe_end);
		const auto* pkppb = ppkppb[k0];
		const auto* pkppw = ppkppw[k1];
		// Every unordered pair (i, j) with j < i is visited exactly once.
		for (int j = 0; j < i; j++ ) {
			const int l0 = list0[j];
			const int l1 = list1[j];
			assert(0 <= l0 && l0 < fe_end);
			assert(0 <= l1 && l1 < fe_end);
			score += pkppb[l0];
			score -= pkppw[l1];
		}
		score += fv_kkp[sq_bk][sq_wk][k0];
	}

	score += MATERIAL * FV_SCALE;
	score /= FV_SCALE;

	// The accumulated score is kept for BLACK and negated for the other side.
	score = (us == BLACK) ? score : -score;

	return score;
#else
	EvalSum sum;
	sum.p[2] = KK[sq_bk][sq_wk];
#if defined USE_AVX2_EVAL || defined USE_SSE_EVAL
	// SIMD path: p[0] and p[1] are cleared and accumulated through m[0].
	sum.m[0] = _mm_setzero_si128();
	for (int i = 0; i < nlist; ++i) {
		const int k0 = list0[i];
		const int k1 = list1[i];
		const auto* pkppb = ppkppb[k0];
		const auto* pkppw = ppkppw[k1];
		for (int j = 0; j < i; ++j) {
			const int l0 = list0[j];
			const int l1 = list1[j];
			// Read each 16-bit pair as one 32-bit word, sign-extend the four
			// 16-bit values to 32 bits and add them to the running sum.
			__m128i tmp;
			tmp = _mm_set_epi32(0, 0, *reinterpret_cast<const int32_t*>(&pkppw[l1][0]), *reinterpret_cast<const int32_t*>(&pkppb[l0][0]));
			tmp = _mm_cvtepi16_epi32(tmp);
			sum.m[0] = _mm_add_epi32(sum.m[0], tmp);
		}
		sum.p[2] += KKP[sq_bk][sq_wk][k0];
	}
	sum.p[2][0] += MATERIAL * FV_SCALE;

#else
	// Start the loop at i = 1 and add the KKP term for i = 0 up front.
	sum.p[2] += KKP[sq_bk][sq_wk][list0[0]];
	sum.p[0][0] = 0;
	sum.p[0][1] = 0;
	sum.p[1][0] = 0;
	sum.p[1][1] = 0;
	for (int i = 1; i < nlist; ++i) {
		const int k0 = list0[i];
		const int k1 = list1[i];
		const auto* pkppb = ppkppb[k0];
		const auto* pkppw = ppkppw[k1];
		for (int j = 0; j < i; ++j) {
			const int l0 = list0[j];
			const int l1 = list1[j];
			sum.p[0] += pkppb[l0];
			sum.p[1] += pkppw[l1];
		}
		sum.p[2] += KKP[sq_bk][sq_wk][k0];
	}
	sum.p[2][0] += MATERIAL * FV_SCALE;
#endif

#ifdef _DEBUG
	// Debug cross-check: recompute every term element by element and verify
	// that it matches the sum computed above.
	score.p[2] = KK[sq_bk][sq_wk];

	score.p[0][0] = 0;
	score.p[0][1] = 0;
	score.p[1][0] = 0;
	score.p[1][1] = 0;

	for (int i = 0; i < nlist; ++i) {
		const int k0 = list0[i];
		const int k1 = list1[i];
		const auto* pkppb = ppkppb[k0];
		const auto* pkppw = ppkppw[k1];
		for (int j = 0; j < i; ++j) {
			const int l0 = list0[j];
			const int l1 = list1[j];
			score.p[0][0] += pkppb[l0][0];
			score.p[0][1] += pkppb[l0][1];
			score.p[1][0] += pkppw[l1][0];
			score.p[1][1] += pkppw[l1][1];
		}
		score.p[2][0] += KKP[sq_bk][sq_wk][k0][0];
		score.p[2][1] += KKP[sq_bk][sq_wk][k0][1];
	}

	score.p[2][0] += MATERIAL * FV_SCALE;
	assert(score.sum(us) == sum.sum(us));
#endif

	return sum.sum(us) / FV_SCALE ;

#endif
}