Example no. 1
// function that implements the kernel of the seismic modeling algorithm
void seismic_exec(float **VEL, float **PPF, float **APF, float **NPF, float* seismicPulseVector, int spPosX, int spPosY, int xDim, int yDim, int timeSteps)
{
  int i,j;  // spatial loops counters
  int t;    // time loop counter
  #ifdef _VERBOSE
    int progressTimer = -1;
  #endif

  // make sure packing _all_ the data into sets of 4 elements is OK
  assert( xDim % 4 == 0 );

  #ifdef _VERBOSE
    printf("processing...\n");
    printf("point of explosion = %d, %d\n", spPosX, spPosY);
  #endif

  // there are 16 XMM registers in 64-bit mode, so there is no need to spill to the stack
  __m128 s_ppf, s_vel, s_actual, s_above1, s_left1, s_under1, s_right1, s_two, s_sixteen, s_sixty;
  __m128 s_above2, s_under2, s_left2, s_right2;

  float two[4] = {2.0f, 2.0f, 2.0f, 2.0f };
  float sixteen[4] = {16.0f,16.0f,16.0f,16.0f};
  float sixty[4] = {60.f,60.f,60.f,60.f};

  // preload XMM registers with constant values.
  s_two = _mm_load_ps( two );
  s_sixteen = _mm_load_ps( sixteen );
  s_sixty = _mm_load_ps( sixty );

  // time loop
  for (t = 0; t < timeSteps; t++)
  {
    #ifdef _VVERBOSE
      printf("----------------------------------------------\ntimestep: %d\n\n", t );
    #endif

    // add pulse
    APF[spPosY][spPosX] += seismicPulseVector[t];

    for(i=2; i<(yDim-2); i++)
    {
      for(j=2 + ALIGNMENT_OFFSET; j<(xDim-2); j+=4)
      {
        s_ppf = _mm_load_ps( &(PPF[i][j]) );
        s_vel = _mm_load_ps( &(VEL[i][j]) );
        s_actual = _mm_load_ps( &(APF[i][j]) );

        s_left1 = _mm_load_ps( &(APF[i-1][j]) );
        s_left2 = _mm_load_ps( &(APF[i-2][j]) );
        s_right2 = _mm_load_ps( &(APF[i+2][j]) );
        s_right1 = _mm_load_ps( &(APF[i+1][j]) );
        s_above1 = _mm_loadu_ps( &(APF[i][j-1]) );
        s_under1 = _mm_loadu_ps( &(APF[i][j+1]) );
        s_above2 = _mm_loadl_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(1, 0, 0, 0)),
                                (const __m64 *)&(APF[i][j-2]));

        s_under2 = _mm_loadh_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(0, 0, 3, 2)),
                                (const __m64 *)&(APF[i][j+4]));
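        // s_above2/s_under2 reuse the aligned load already held in s_actual:
        // the shuffle keeps the two in-range elements and loadl_pi/loadh_pi
        // fill in the two missing ones, giving APF[i][j-2..j+1] and
        // APF[i][j+2..j+5] without additional unaligned 16-byte loads.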

        // sum elements with an offset of one
        s_under1 = _mm_add_ps( s_under1, _mm_add_ps( s_above1, _mm_add_ps( s_left1, s_right1)));
        // sum elements with an offset of two
        s_above2 = _mm_add_ps( s_left2, _mm_add_ps( s_right2, _mm_add_ps( s_under2, s_above2)));
        // multiply with 16
        s_under1 = _mm_mul_ps( s_sixteen, s_under1 );
        // subtract the offset-two sum and 60 * the current value
        s_under1 = _mm_sub_ps( _mm_sub_ps( s_under1,  s_above2), _mm_mul_ps( s_sixty, s_actual ) );
        s_under1 = _mm_add_ps( _mm_mul_ps( s_vel, s_under1), _mm_sub_ps(_mm_mul_ps( s_two, s_actual ), s_ppf) );
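        // i.e. NPF[i][j] = VEL*(16*(offset-1 sum) - (offset-2 sum) - 60*APF[i][j])
        //                  + 2*APF[i][j] - PPF[i][j]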

        // save the result
        _mm_store_ps( &(NPF[i][j]), s_under1);

        #ifdef _VVERBOSE
          printf("[%d][%d]\n", i, j);
        #endif
      }
      #ifdef _VVERBOSE
        printf("\n");
      #endif
    }

    #ifdef _VERBOSE
      // print one '#' for every 10% of the total processing time
      if (t/(timeSteps/10) > progressTimer )
      {
        printf("#");
        progressTimer++;
        fflush(stdout);
      }
    #endif

    // switch pointers instead of copying data
    PPF = APF;
    APF = NPF;
    NPF = PPF;
  }

  #ifdef _VERBOSE
    printf("\nend process!\n");
  #endif
}
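
For reference, the vectorized inner loop above computes the following scalar stencil; this is an illustrative sketch written for this listing (ignoring ALIGNMENT_OFFSET), not part of the original source:

// Scalar equivalent of the 4-wide SSE update in seismic_exec().
static void seismic_step_scalar(float **VEL, float **PPF, float **APF,
                                float **NPF, int xDim, int yDim)
{
  for (int i = 2; i < yDim - 2; i++) {
    for (int j = 2; j < xDim - 2; j++) {
      float sum1 = APF[i-1][j] + APF[i+1][j] + APF[i][j-1] + APF[i][j+1];
      float sum2 = APF[i-2][j] + APF[i+2][j] + APF[i][j-2] + APF[i][j+2];
      NPF[i][j] = VEL[i][j] * (16.0f * sum1 - sum2 - 60.0f * APF[i][j])
                  + 2.0f * APF[i][j] - PPF[i][j];
    }
  }
}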
int main()
{
#ifndef __EMSCRIPTEN__
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif

	printf ("{ \"workload\": %u, \"results\": [\n", N);
	assert(N%2 == 0); // Don't care about the tail for now.
	double *src = get_src_d();//(float*)aligned_alloc(16, N*sizeof(float));
	for(int i = 0; i < N; ++i)
		src[i] = (double)rand() / RAND_MAX;
	double *src2 = get_src2_d();//(float*)aligned_alloc(16, N*sizeof(float));
	for(int i = 0; i < N; ++i)
		src2[i] = (double)rand() / RAND_MAX;
	double *dst = get_dst_d();//(float*)aligned_alloc(16, N*sizeof(float));

	float scalarTime;
	SETCHART("load");
	START();
		for(int i = 0; i < N; ++i)
			dst[i] = src[i];
	ENDSCALAR(checksum_dst(dst), "scalar");

	LS_TEST("_mm_load_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_load_pd1", _mm_load_pd1, 1, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_load_sd", _mm_load_sd, 1, _mm_store_pd, double*, 0, 2);
	// _mm_load_si128
	LS_TEST("_mm_load1_pd", _mm_load1_pd, 1, _mm_store_pd, double*, 0, 2);

	__m128d tempReg = _mm_set_pd(1.0, 2.0);
	LSH_TEST("_mm_loadh_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);
	// _mm_loadl_epi64
	LSH_TEST("_mm_loadl_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);

	LS_TEST("_mm_loadr_pd", _mm_loadr_pd, 0, _mm_store_pd, double*, 0, 2);
	LS_TEST("_mm_loadu_pd", _mm_loadu_pd, 1, _mm_store_pd, double*, 0, 2);
	// _mm_loadu_si128

	SETCHART("set");
/*	_mm_set_epi16
	_mm_set_epi32
	_mm_set_epi64
	_mm_set_epi64x
	_mm_set_epi8 */
	SS_TEST_D("_mm_set_pd", _mm_set_pd(src[i+2], src[i+0]));
	//SS_TEST_D("_mm_set_pd1", _mm_set_pd1(src[i]));
	SS_TEST_D("_mm_set_sd", _mm_set_sd(src[i]));
/*	_mm_set1_epi16
	_mm_set1_epi32
	_mm_set1_epi64
	_mm_set1_epi64x
	_mm_set1_epi8 */
	SS_TEST_D("_mm_set1_pd", _mm_set1_pd(src[i]));
/* 	_mm_setr_epi16
	_mm_setr_epi32
	_mm_setr_epi64
	_mm_setr_epi8 */
	SS_TEST_D("_mm_setr_pd", _mm_set_pd(src[i+2], src[i+0]));
	SS_TEST_D("_mm_setzero_pd", _mm_setzero_pd());
//	_mm_setzero_si128

	SETCHART("move");
	// _mm_move_epi64
	SS_TEST_D("_mm_move_sd", _mm_move_sd(_mm_load_pd(src+i), _mm_load_pd(src2+i)));

	SETCHART("store");
	// _mm_maskmoveu_si128
	LS_TEST("_mm_store_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
//	LS_TEST("_mm_store_pd1", _mm_load_pd, 0, _mm_store_pd1, double*, 0);
	LS_TEST("_mm_store_sd", _mm_load_pd, 0, _mm_store_sd, double*, 1, 2);
	// _mm_store_si128
	// _mm_store1_pd
	LS64_TEST("_mm_storeh_pi", _mm_load_pd, 0, _mm_storeh_pi, 1, 2);
	// _mm_storel_epi64
	LS64_TEST("_mm_storel_pi", _mm_load_pd, 0, _mm_storel_pi, 1, 2);
	LS_TEST("_mm_storer_pd", _mm_load_pd, 0, _mm_storer_pd, double*, 0, 2);
	LS_TEST("_mm_storeu_pd", _mm_load_pd, 0, _mm_storeu_pd, double*, 1, 2);
	// _mm_storeu_si128
	LS_TEST("_mm_stream_pd", _mm_load_pd, 0, _mm_stream_pd, double*, 0, 2);
	// _mm_stream_si128
	// _mm_stream_si32
	// _mm_stream_si64

	SETCHART("arithmetic");
	// _mm_add_epi16
	// _mm_add_epi32
	// _mm_add_epi64
	// _mm_add_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar add");
	BINARYOP_TEST_D("_mm_add_pd", _mm_add_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_add_sd", _mm_add_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_adds_epi16
	// _mm_adds_epi8
	// _mm_adds_epu16
	// _mm_adds_epu8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar div");
	BINARYOP_TEST_D("_mm_div_pd", _mm_div_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_div_sd", _mm_div_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_madd_epi16
	// _mm_mul_epu32
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar mul");
	BINARYOP_TEST_D("_mm_mul_pd", _mm_mul_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_mul_sd", _mm_mul_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_mulhi_epi16
	// _mm_mulhi_epu16
	// _mm_mullo_epi16
	// _mm_sad_epu8
	// _mm_sub_epi16
	// _mm_sub_epi32
	// _mm_sub_epi64
	// _mm_sub_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar sub");
	BINARYOP_TEST_D("_mm_sub_pd", _mm_sub_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_sub_sd", _mm_sub_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_subs_epi16
	// _mm_subs_epi8
	// _mm_subs_epu16
	// _mm_subs_epu8

	SETCHART("roots");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = sqrt(dst[0]); dst[1] = sqrt(dst[1]); dst[2] = sqrt(dst[2]); dst[3] = sqrt(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar sqrt");
	UNARYOP_TEST_D("_mm_sqrt_pd", _mm_sqrt_pd, _mm_load_pd(src));
//	UNARYOP_TEST_D("_mm_sqrt_sd", _mm_sqrt_sd, _mm_load_pd(src));

	SETCHART("logical");
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) & dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) & dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) & dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar and");
	BINARYOP_TEST_D("_mm_and_pd", _mm_and_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_and_si128
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd((~dcastu(dst[0])) & dcastu(src2[0])); dst[1] = ucastd((~dcastu(dst[1])) & dcastu(src2[1])); dst[2] = ucastd((~dcastu(dst[2])) & dcastu(src2[2])); dst[3] = ucastd((~dcastu(dst[3])) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar andnot");
	BINARYOP_TEST_D("_mm_andnot_pd", _mm_andnot_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_andnot_si128
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) | dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) | dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) | dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) | dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar or");
	BINARYOP_TEST_D("_mm_or_pd", _mm_or_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_or_si128
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) ^ dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) ^ dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) ^ dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) ^ dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar xor");
	BINARYOP_TEST_D("_mm_xor_pd", _mm_xor_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_xor_si128

	SETCHART("cmp");
	// _mm_cmpeq_epi16
	// _mm_cmpeq_epi32
	// _mm_cmpeq_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] == src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] == src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] == src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] == src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp==");
	BINARYOP_TEST_D("_mm_cmpeq_pd", _mm_cmpeq_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpeq_sd", _mm_cmpeq_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] >= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] >= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] >= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] >= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>=");
	BINARYOP_TEST_D("_mm_cmpge_pd", _mm_cmpge_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpge_sd", _mm_cmpge_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_cmpgt_epi16
	// _mm_cmpgt_epi32
	// _mm_cmpgt_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] > src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] > src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] > src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] > src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>");
	BINARYOP_TEST_D("_mm_cmpgt_pd", _mm_cmpgt_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpgt_sd", _mm_cmpgt_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] <= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] <= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] <= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] <= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<=");
	BINARYOP_TEST_D("_mm_cmple_pd", _mm_cmple_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmple_sd", _mm_cmple_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_cmplt_epi16
	// _mm_cmplt_epi32
	// _mm_cmplt_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] < src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] < src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] < src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] < src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<");
	BINARYOP_TEST_D("_mm_cmplt_pd", _mm_cmplt_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmplt_sd", _mm_cmplt_sd, _mm_load_pd(src), _mm_load_pd(src2));

	/*_mm_cmpneq_pd
	_mm_cmpneq_sd
	_mm_cmpnge_pd
	_mm_cmpnge_sd
	_mm_cmpngt_pd
	_mm_cmpngt_sd
	_mm_cmpnle_pd
	_mm_cmpnle_sd
	_mm_cmpnlt_pd
	_mm_cmpnlt_sd*/

	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpord");
	BINARYOP_TEST_D("_mm_cmpord_pd", _mm_cmpord_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpord_sd", _mm_cmpord_sd, _mm_load_pd(src), _mm_load_pd(src2));
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpunord");
	BINARYOP_TEST_D("_mm_cmpunord_pd", _mm_cmpunord_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_cmpunord_sd", _mm_cmpunord_sd, _mm_load_pd(src), _mm_load_pd(src2));

	SETCHART("max");
	// _mm_max_epi16
	// _mm_max_epu8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]); dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar max");
	BINARYOP_TEST_D("_mm_max_pd", _mm_max_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_max_sd", _mm_max_sd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_min_epi16
	// _mm_min_epu8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]); dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar min");
	BINARYOP_TEST_D("_mm_min_pd", _mm_min_pd, _mm_load_pd(src), _mm_load_pd(src2));
	BINARYOP_TEST_D("_mm_min_sd", _mm_min_sd, _mm_load_pd(src), _mm_load_pd(src2));

	SETCHART("shuffle");
	// _mm_extract_epi16
	// _mm_insert_epi16
	// _mm_shuffle_epi32
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2]; } ENDSCALAR(checksum_dst(dst), "scalar shuffle");
//	BINARYOP_TEST_D("_mm_shuffle_pd", _mm_shuffle_pd, _mm_load_pd(src), _mm_load_pd(src2));
	START();
		__m128d o0 = _mm_load_pd(src);
		__m128d o1 = _mm_load_pd(src2);
		for(int i = 0; i < N; i += 4)
			o0 = _mm_shuffle_pd(o0, o1, _MM_SHUFFLE2(1, 0));
		_mm_store_pd(dst, o0);
	END(checksum_dst(dst), "_mm_shuffle_pd");

	// _mm_shufflehi_epi16
	// _mm_shufflelo_epi16
	// _mm_unpackhi_epi16
	// _mm_unpackhi_epi32
	// _mm_unpackhi_epi64
	// _mm_unpackhi_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar unpackhi_pd");
	BINARYOP_TEST_D("_mm_unpackhi_pd", _mm_unpackhi_pd, _mm_load_pd(src), _mm_load_pd(src2));
	// _mm_unpacklo_epi16
	// _mm_unpacklo_epi32
	// _mm_unpacklo_epi64
	// _mm_unpacklo_epi8
	START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1]; } ENDSCALAR(checksum_dst(dst), "scalar unpacklo_pd");
	BINARYOP_TEST_D("_mm_unpacklo_pd", _mm_unpacklo_pd, _mm_load_pd(src), _mm_load_pd(src2));
	printf("]}\n");
/*
	printf("Finished!\n");
	printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec());
	printf("Total time spent in SSE1 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec());
	if (scalarTotalTicks > simdTotalTicks)
		printf("SSE1 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks);
	else
		printf("SSE1 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks);
*/
#ifdef __EMSCRIPTEN__
	fprintf(stderr,"User Agent: %s\n", emscripten_run_script_string("navigator.userAgent"));
	printf("/*Test finished! Now please close Firefox to continue with benchmark_sse2.py.*/\n");
#endif
	exit(0);
}
Example no. 3
OD_SIMD_INLINE int32_t od_mc_compute_satd8_8x8_part(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int32_t satd;
  __m128i sums;
  __m128i a;
  __m128i b;
  __m128i c;
  __m128i d;
  __m128i e;
  __m128i f;
  __m128i g;
  __m128i h;
  a = od_load_convert_subtract_x8(src + 0*systride, ref + 0*rystride);
  b = od_load_convert_subtract_x8(src + 1*systride, ref + 1*rystride);
  c = od_load_convert_subtract_x8(src + 2*systride, ref + 2*rystride);
  d = od_load_convert_subtract_x8(src + 3*systride, ref + 3*rystride);
  e = od_load_convert_subtract_x8(src + 4*systride, ref + 4*rystride);
  f = od_load_convert_subtract_x8(src + 5*systride, ref + 5*rystride);
  g = od_load_convert_subtract_x8(src + 6*systride, ref + 6*rystride);
  h = od_load_convert_subtract_x8(src + 7*systride, ref + 7*rystride);
  /*Vertical 1D transform.*/
  od_mc_butterfly_2x2_16x8(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x8(&e, &f, &g, &h);
  od_mc_butterfly_2x2_16x8(&a, &b, &e, &f);
  od_mc_butterfly_2x2_16x8(&c, &d, &g, &h);
  od_mc_butterfly_2x2_16x8(&a, &b, &e, &f);
  od_mc_butterfly_2x2_16x8(&c, &d, &g, &h);
  od_transpose16x8(&a, &c, &b, &d, &e, &g, &f, &h);
  /*Horizontal 1D transform.*/
  od_mc_butterfly_2x2_16x8(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x8(&e, &f, &g, &h);
  od_mc_butterfly_2x2_16x8(&a, &b, &e, &f);
  od_mc_butterfly_2x2_16x8(&c, &d, &g, &h);
  /*Use the fact that (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) to merge the
     final butterfly stage with the calculation of the absolute values and the
     first stage of accumulation.
    Calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    An offset must be added to the final sum before rounding to account for
     subtracting 0x7FFF.*/
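  /*For example, with a=3 and b=-5: (abs(a+b)+abs(a-b))/2 = (2+8)/2 = 5
     = max(abs(a),abs(b)).
    The saturating add realizes this without an explicit abs():
     if a+b >= 0, adds(a+b,0x7FFF) saturates to 0x7FFF, so
      max(a,b)-0x7FFF = ((a+b)+abs(a-b))/2 - 0x7FFF = (abs(a+b)+abs(a-b))/2 - 0x7FFF;
     if a+b < 0, adds(a+b,0x7FFF) = a+b+0x7FFF, so
      max(a,b)-(a+b)-0x7FFF = (abs(a-b)-(a+b))/2 - 0x7FFF
       = (abs(a+b)+abs(a-b))/2 - 0x7FFF.*/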
  a = _mm_sub_epi16(_mm_max_epi16(a, b), _mm_adds_epi16(_mm_add_epi16(a, b),
   _mm_set1_epi16(0x7FFF)));
  e = _mm_sub_epi16(_mm_max_epi16(e, f), _mm_adds_epi16(_mm_add_epi16(e, f),
   _mm_set1_epi16(0x7FFF)));
  c = _mm_sub_epi16(_mm_max_epi16(c, d), _mm_adds_epi16(_mm_add_epi16(c, d),
   _mm_set1_epi16(0x7FFF)));
  g = _mm_sub_epi16(_mm_max_epi16(g, h), _mm_adds_epi16(_mm_add_epi16(g, h),
   _mm_set1_epi16(0x7FFF)));
  a = _mm_add_epi16(a, e);
  c = _mm_add_epi16(c, g);
  /*Convert to 32-bit unsigned integers and sum horizontally using madd to
    avoid overflowing 16-bit unsigned integers.
    The naively calculated max values of a and c are
     ((8 rows * 8 cols * 256) * 2 / 2 + 1 (offset)) * 2 or 0x8002.
    The actual max is lower so it is safe to use _mm_madd_epi16.*/
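  /*Note: _mm_madd_epi16(x, _mm_set1_epi16(1)) multiplies every 16-bit lane by 1
     and adds adjacent pairs, i.e. a widening horizontal add that yields four
     32-bit sums.*/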
  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
  c = _mm_madd_epi16(c, _mm_set1_epi16(1));
  sums = _mm_add_epi32(a, c);
  /*Sum the elements of the vector.*/
  sums = _mm_add_epi32(sums, _mm_shuffle_epi32(sums, _MM_SHUFFLE(0, 1, 2, 3)));
  sums = _mm_add_epi32(sums, _mm_shuffle_epi32(sums, _MM_SHUFFLE(2, 3, 0, 1)));
  satd = _mm_cvtsi128_si32(sums);
  /*Subtract the offset (32) and round.*/
  satd = (satd + 2 - 32) >> 2;
#if defined(OD_CHECKASM)
  {
    int32_t c_satd;
    c_satd = od_mc_compute_satd8_8x8_c(src, systride, ref, rystride);
    if (satd != c_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
      8, 8, satd, c_satd);
    }
  }
#endif
  return satd;
}
Example no. 4
      int32max
    );

    MEMALIGN(16, __m128i cur_seed_split);
    MEMALIGN(16, __m128i multiplier);
    MEMALIGN(16, __m128i adder);
    MEMALIGN(16, __m128i mod_mask);
    MEMALIGN(16, __m128 res);
    MEMALIGN(16, static const unsigned int mult  [4]) = {214013, 17405, 214013, 69069};
    MEMALIGN(16, static const unsigned int gadd  [4]) = {2531011, 10395331, 13737667, 1};
    MEMALIGN(16, static const unsigned int mask  [4]) = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0};

    adder          = _mm_load_si128((__m128i*)gadd);
    multiplier     = _mm_load_si128((__m128i*)mult);
    mod_mask       = _mm_load_si128((__m128i*)mask);
    cur_seed_split = _mm_shuffle_epi32(m_sseSeed, _MM_SHUFFLE(2, 3, 0, 1));

    m_sseSeed      = _mm_mul_epu32(m_sseSeed, multiplier);
    multiplier     = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));
    cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);

    m_sseSeed      = _mm_and_si128(m_sseSeed, mod_mask);
    cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);
    cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));
    m_sseSeed      = _mm_or_si128(m_sseSeed, cur_seed_split);
    m_sseSeed      = _mm_add_epi32(m_sseSeed, adder);
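    /* Note on the lane handling above: _mm_mul_epu32 only multiplies the even
       32-bit lanes (0 and 2) of each operand into two 64-bit products, so the
       seed is shuffled to bring the odd lanes into even positions and multiplied
       a second time; mod_mask keeps the low 32 bits of each product and the two
       halves are recombined with _mm_or_si128 before the adder is applied. */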

    /* adjust the value to the range requested */
    res = _mm_cvtepi32_ps(m_sseSeed);
    if (sseresult)
      *sseresult = _mm_mul_ps(res, f);
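
// Note: GSSetupPrimCodeGenerator below is a run-time (JIT) code generator; calls such
// as movaps(), shufps() and cvttps2dq() emit x86 instructions into a buffer rather
// than executing them here, and the commented-out GSVector4 lines describe the
// computation the generated code will perform.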
void GSSetupPrimCodeGenerator::Depth()
{
	if(!m_en.z && !m_en.f)
	{
		return;
	}

	if(!m_sel.sprite)
	{
		// GSVector4 t = dscan.p;

		movaps(xmm0, xmmword[edx + 16]);

		if(m_en.f)
		{
			// GSVector4 df = p.wwww();

			movaps(xmm1, xmm0);
			shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

			// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();

			movaps(xmm2, xmm1);
			mulps(xmm2, xmm3);
			cvttps2dq(xmm2, xmm2);
			pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
			movdqa(xmmword[&m_env.d4.f], xmm2);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

				movaps(xmm2, xmm1);
				mulps(xmm2, Xmm(4 + i));
				cvttps2dq(xmm2, xmm2);
				pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
				movdqa(xmmword[&m_env.d[i].f], xmm2);
			}
		}

		if(m_en.z)
		{
			// GSVector4 dz = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			// m_env.d4.z = dz * 4.0f;

			movaps(xmm1, xmm0);
			mulps(xmm1, xmm3);
			movdqa(xmmword[&m_env.d4.z], xmm1);

			for(int i = 0; i < 4; i++)
			{
				// m_env.d[i].z = dz * m_shift[i];

				movaps(xmm1, xmm0);
				mulps(xmm1, Xmm(4 + i));
				movdqa(xmmword[&m_env.d[i].z], xmm1);
			}
		}
	}
	else
	{
		// GSVector4 p = vertices[0].p;

		movaps(xmm0, xmmword[ecx + 16]);

		if(m_en.f)
		{
			// m_env.p.f = GSVector4i(p).zzzzh().zzzz();

			movaps(xmm1, xmm0);
			cvttps2dq(xmm1, xmm1);
			pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
			movdqa(xmmword[&m_env.p.f], xmm1);
		}

		if(m_en.z)
		{
			// GSVector4 z = p.zzzz();

			shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

			if(m_sel.zoverflow)
			{
				// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

				static const float half = 0.5f;

				movss(xmm1, dword[&half]);
				shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
				mulps(xmm1, xmm0);
				cvttps2dq(xmm1, xmm1);
				pslld(xmm1, 1);

				cvttps2dq(xmm0, xmm0);
				pcmpeqd(xmm2, xmm2);
				psrld(xmm2, 31);
				pand(xmm0, xmm2);

				por(xmm0, xmm1);
			}
			else
			{
				// m_env.p.z = GSVector4i(z);

				cvttps2dq(xmm0, xmm0);
			}

			movdqa(xmmword[&m_env.p.z], xmm0);
		}
	}
}
Example no. 6
gboolean
gimp_operation_normal_process_sse2 (GeglOperation       *op,
                                    void                *in_p,
                                    void                *layer_p,
                                    void                *mask_p,
                                    void                *out_p,
                                    glong                samples,
                                    const GeglRectangle *roi,
                                    gint                 level)
{
  /* check alignment */
  if ((((uintptr_t)in_p) | ((uintptr_t)layer_p) | ((uintptr_t)out_p)) & 0x0F)
    {
      return gimp_operation_normal_process (op,
                                            in_p, layer_p, mask_p, out_p,
                                            samples, roi, level);
    }
  else
    {
      GimpOperationLayerMode *layer_mode      = (gpointer) op;
      gfloat                  opacity         = layer_mode->opacity;
      gfloat                 *mask            = mask_p;
      const                   __v4sf *v_in    = (const __v4sf*) in_p;
      const                   __v4sf *v_layer = (const __v4sf*) layer_p;
                              __v4sf *v_out   = (      __v4sf*) out_p;

      const __v4sf one       = _mm_set1_ps (1.0f);
      const __v4sf v_opacity = _mm_set1_ps (opacity);

      switch (layer_mode->real_composite_mode)
        {
        case GIMP_LAYER_COMPOSITE_UNION:
        case GIMP_LAYER_COMPOSITE_AUTO:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_layer,
                                                 _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  __v4sf dst_alpha, a_term, out_pixel, out_alpha, out_pixel_rbaa;

                  /* expand alpha */
                  dst_alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_in,
                                                         _MM_SHUFFLE (3, 3, 3, 3));

                  /* a_term = dst_a * (1.0 - src_a) */
                  a_term = dst_alpha * (one - alpha);

                  /* out(color) = src * src_a + dst * a_term */
                  out_pixel = rgba_layer * alpha + rgba_in * a_term;

                  /* out(alpha) = 1.0 * src_a + 1.0 * a_term */
                  out_alpha = alpha + a_term;

                  /* un-premultiply */
                  out_pixel = out_pixel / out_alpha;

                  /* swap in the real alpha */
                  out_pixel_rbaa = _mm_shuffle_ps (out_pixel, out_alpha, _MM_SHUFFLE (3, 3, 2, 0));
                  out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa, _MM_SHUFFLE (2, 1, 1, 0));

                  *v_out++ = out_pixel;
                }
              else
                {
                  *v_out++ = rgba_in;
                }
            }
          break;

        case GIMP_LAYER_COMPOSITE_CLIP_TO_BACKDROP:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_layer,
                                                 _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  __v4sf dst_alpha, out_pixel, out_pixel_rbaa;

                  /* expand alpha */
                  dst_alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_in,
                                                         _MM_SHUFFLE (3, 3, 3, 3));

                  /* out(color) = dst * (1 - src_a) + src * src_a */
                  out_pixel = rgba_in + (rgba_layer - rgba_in) * alpha;

                  /* swap in the real alpha */
                  out_pixel_rbaa = _mm_shuffle_ps (out_pixel, dst_alpha, _MM_SHUFFLE (3, 3, 2, 0));
                  out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa, _MM_SHUFFLE (2, 1, 1, 0));

                  *v_out++ = out_pixel;
                }
              else
                {
                  *v_out++ = rgba_in;
                }
            }
          break;

        case GIMP_LAYER_COMPOSITE_CLIP_TO_LAYER:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;
              __v4sf out_pixel, out_pixel_rbaa;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_layer,
                                                 _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  /* out(color) = src */
                  out_pixel = rgba_layer;
                }
              else
                {
                  out_pixel = rgba_in;
                }

              /* swap in the real alpha */
              out_pixel_rbaa = _mm_shuffle_ps (out_pixel, alpha, _MM_SHUFFLE (3, 3, 2, 0));
              out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa, _MM_SHUFFLE (2, 1, 1, 0));

              *v_out++ = out_pixel;
            }
          break;

        case GIMP_LAYER_COMPOSITE_INTERSECTION:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;
              __v4sf out_pixel, out_pixel_rbaa;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_layer,
                                                 _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              /* multiply the alpha by in's alpha */
              alpha *= (__v4sf)_mm_shuffle_epi32 ((__m128i)rgba_in,
                                                  _MM_SHUFFLE (3, 3, 3, 3));

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  /* out(color) = src */
                  out_pixel = rgba_layer;
                }
              else
                {
                  out_pixel = rgba_in;
                }

              /* swap in the real alpha */
              out_pixel_rbaa = _mm_shuffle_ps (out_pixel, alpha, _MM_SHUFFLE (3, 3, 2, 0));
              out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa, _MM_SHUFFLE (2, 1, 1, 0));

              *v_out++ = out_pixel;
            }
          break;
        }
    }

  return TRUE;
}
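
For clarity, the GIMP_LAYER_COMPOSITE_UNION / AUTO branch above performs ordinary "normal mode" compositing on straight (non-premultiplied) RGBA; a scalar per-pixel sketch of the same math (illustrative only, not part of GIMP's sources) looks like this:

/* alpha = layer alpha already multiplied by mask and opacity, assumed > 0
   as in the guarded branch above */
static inline void
normal_composite_union (const float in[4], const float layer[4],
                        float alpha, float out[4])
{
  float a_term = in[3] * (1.0f - alpha);   /* dst_a * (1 - src_a) */
  float out_a  = alpha + a_term;           /* composite alpha     */
  int   c;

  for (c = 0; c < 3; c++)                  /* composite, then un-premultiply */
    out[c] = (layer[c] * alpha + in[c] * a_term) / out_a;

  out[3] = out_a;
}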
void GSSetupPrimCodeGenerator::Texture()
{
	if(!m_en.t)
	{
		return;
	}

	// GSVector4 t = dscan.t;

	movaps(xmm0, xmmword[edx + 32]);

	movaps(xmm1, xmm0);
	mulps(xmm1, xmm3);

	if(m_sel.fst)
	{
		// m_env.d4.st = GSVector4i(t * 4.0f);

		cvttps2dq(xmm1, xmm1);
		movdqa(xmmword[&m_env.d4.st], xmm1);
	}
	else
	{
		// m_env.d4.stq = t * 4.0f;

		movaps(xmmword[&m_env.d4.stq], xmm1);
	}

	for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
	{
		// GSVector4 ds = t.xxxx();
		// GSVector4 dt = t.yyyy();
		// GSVector4 dq = t.zzzz();

		movaps(xmm1, xmm0);
		shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));

		for(int i = 0; i < 4; i++)
		{
			// GSVector4 v = ds/dt * m_shift[i];

			movaps(xmm2, xmm1);
			mulps(xmm2, Xmm(4 + i));

			if(m_sel.fst)
			{
				// m_env.d[i].si/ti = GSVector4i(v);

				cvttps2dq(xmm2, xmm2);

				switch(j)
				{
				case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break;
				case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break;
				}
			}
			else
			{
				// m_env.d[i].s/t/q = v;

				switch(j)
				{
				case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break;
				case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break;
				case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break;
				}
			}
		}
	}
}
void GSSetupPrimCodeGenerator::Color()
{
	if(!m_en.c)
	{
		return;
	}

	if(m_sel.iip)
	{
		// GSVector4 c = dscan.c;

		movaps(xmm0, xmmword[edx]);
		movaps(xmm1, xmm0);

		// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

		movaps(xmm2, xmm0);
		mulps(xmm2, xmm3);
		cvttps2dq(xmm2, xmm2);
		pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
		packssdw(xmm2, xmm2);
		movdqa(xmmword[&m_env.d4.c], xmm2);

		// xmm3 is not needed anymore

		// GSVector4 dr = c.xxxx();
		// GSVector4 db = c.zzzz();

		shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));

		for(int i = 0; i < 4; i++)
		{
			// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();

			movaps(xmm2, xmm0);
			mulps(xmm2, Xmm(4 + i));
			cvttps2dq(xmm2, xmm2);
			packssdw(xmm2, xmm2);

			// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();

			movaps(xmm3, xmm1);
			mulps(xmm3, Xmm(4 + i));
			cvttps2dq(xmm3, xmm3);
			packssdw(xmm3, xmm3);

			// m_env.d[i].rb = r.upl16(b);

			punpcklwd(xmm2, xmm3);
			movdqa(xmmword[&m_env.d[i].rb], xmm2);
		}

		// GSVector4 c = dscan.c;

		movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
		movaps(xmm1, xmm0);

		// GSVector4 dg = c.yyyy();
		// GSVector4 da = c.wwww();

		shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
		shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

		for(int i = 0; i < 4; i++)
		{
			// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

			movaps(xmm2, xmm0);
			mulps(xmm2, Xmm(4 + i));
			cvttps2dq(xmm2, xmm2);
			packssdw(xmm2, xmm2);

			// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();

			movaps(xmm3, xmm1);
			mulps(xmm3, Xmm(4 + i));
			cvttps2dq(xmm3, xmm3);
			packssdw(xmm3, xmm3);

			// m_env.d[i].ga = g.upl16(a);

			punpcklwd(xmm2, xmm3);
			movdqa(xmmword[&m_env.d[i].ga], xmm2);
		}
	}
	else
	{
		// GSVector4i c = GSVector4i(vertices[0].c);

		movaps(xmm0, xmmword[ecx]);
		cvttps2dq(xmm0, xmm0);

		// c = c.upl16(c.zwxy());

		movdqa(xmm1, xmm0);
		pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
		punpcklwd(xmm0, xmm1);

		// if(!tme) c = c.srl16(7);

		if(m_sel.tfx == TFX_NONE)
		{
			psrlw(xmm0, 7);
		}

		// m_env.c.rb = c.xxxx();
		// m_env.c.ga = c.zzzz();

		movdqa(xmm1, xmm0);
		pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
		pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
		movdqa(xmmword[&m_env.c.rb], xmm0);
		movdqa(xmmword[&m_env.c.ga], xmm1);
	}
}
Example no. 9
unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
{
	FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
	unsigned i, order;

	__m128i total_err0, total_err1, total_err2;

	{
		FLAC__int32 itmp;
		__m128i last_error;

		last_error = _mm_cvtsi32_si128(data[-1]);							// 0   0   0   le0
		itmp = data[-2];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   0   le0 le1
		itmp -= data[-3];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// 0   le0 le1 le2
		itmp -= data[-3] - data[-4];
		last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
		last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));	// le0 le1 le2 le3

		total_err0 = total_err1 = _mm_setzero_si128();
		for(i = 0; i < data_len; i++) {
			__m128i err0, err1;
			err0 = _mm_cvtsi32_si128(data[i]);								// 0   0   0   e0
			err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));			// e0  e0  e0  e0
#if 1 /* OPT_SSE */
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   le0 le1 le2
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   0   le0 le1
			err1 = _mm_sub_epi32(err1, last_error);
			last_error = _mm_srli_si128(last_error, 4);						// 0   0   0   le0
			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
#else
			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8));	// le0  le1  le2+le0  le3+le1
			last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4));	// le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
			err1 = _mm_sub_epi32(err1, last_error);							// e1  e2  e3  e4
#endif
			last_error = _mm_alignr_epi8(err0, err1, 4);					// e0  e1  e2  e3

			err0 = _mm_abs_epi32(err0);
			err1 = _mm_abs_epi32(err1);

			total_err0 = _mm_add_epi32(total_err0, err0);					// 0   0   0   te0
			total_err1 = _mm_add_epi32(total_err1, err1);					// te1 te2 te3 te4
		}
	}
	
	total_error_0 = _mm_cvtsi128_si32(total_err0);
	total_err2 = total_err1;											// te1  te2  te3  te4
	total_err1 = _mm_srli_si128(total_err1, 8);							//  0    0   te1  te2
	total_error_4 = _mm_cvtsi128_si32(total_err2);
	total_error_2 = _mm_cvtsi128_si32(total_err1);
	total_err2 = _mm_srli_si128(total_err2,	4);							//  0   te1  te2  te3
	total_err1 = _mm_srli_si128(total_err1, 4);							//  0    0    0   te1
	total_error_3 = _mm_cvtsi128_si32(total_err2);
	total_error_1 = _mm_cvtsi128_si32(total_err1);

	/* prefer higher order */
	if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
		order = 0;
	else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
		order = 1;
	else if(total_error_2 < flac_min(total_error_3, total_error_4))
		order = 2;
	else if(total_error_3 < total_error_4)
		order = 3;
	else
		order = 4;

	/* Estimate the expected number of bits per residual signal sample. */
	/* 'total_error*' is linearly related to the variance of the residual */
	/* signal, so we use it directly to compute E(|x|) */
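	/* i.e. residual_bits_per_sample[k] = log2(M_LN2 * total_error_k / data_len) */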
	FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
	FLAC__ASSERT(data_len > 0 || total_error_4 == 0);

	residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
	residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);

	return order;
}
Example no. 10
// count genotype sum and number of calls, not requiring 16-aligned p
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
	size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
	C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

	const __m256i three = _mm256_set1_epi8(3);
	const __m256i zero = _mm256_setzero_si256();
	__m256i sum32 = zero, num32 = zero;
	size_t limit_by_U8 = 0;

	for (; n >= 32; )
	{
		__m256i v = _mm256_loadu_si256((__m256i const*)p);
		p += 32;
		__m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
		sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
		num32 = _mm256_sub_epi8(num32, m);
		n -= 32;
		limit_by_U8 ++;
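		// flush before the 8-bit lanes can overflow: each byte of sum32 grows by
		// at most 2 per iteration and each byte of num32 by at most 1, so 127
		// iterations are safe before widening the partial sums to 32 bits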
		if ((limit_by_U8 >= 127) || (n < 32))
		{
			// add to sum
			sum32 = _mm256_sad_epu8(sum32, zero);
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
			sum32 = _mm256_add_epi32(sum32,
				_mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
			sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
			// add to num
			num32 = _mm256_sad_epu8(num32, zero);
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
			num32 = _mm256_add_epi32(num32,
				_mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
			num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
			// reset
			sum32 = num32 = zero;
			limit_by_U8 = 0;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	// header, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p <= 2) { sum += *p; num++; }

	const __m128i three = _mm_set1_epi8(3);
	const __m128i zero = _mm_setzero_si128();
	__m128i sum16=zero, num16=zero;
	size_t limit_by_U8 = 0;

	for (; n >= 16; )
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		p += 16;
		__m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
		sum16 = _mm_add_epi8(sum16, _mm_and_si128(v, m));
		num16 = _mm_sub_epi8(num16, m);
		n -= 16;
		limit_by_U8 ++;
		if ((limit_by_U8 >= 127) || (n < 16))
		{
			// add to sum
			sum16 = _mm_sad_epu8(sum16, zero);
			sum += _mm_cvtsi128_si32(sum16);
			sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
			// add to num
			num16 = _mm_sad_epu8(num16, zero);
			num += _mm_cvtsi128_si32(num16);
			num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
			// reset
			sum16 = num16 = zero;
			limit_by_U8 = 0;
		}
	}

#endif

	for (; n > 0; n--, p++)
		if (*p <= 2) { sum += *p; num++; }
	out_sum = sum;
	out_num = num;
	return p;
}
Example no. 11
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
		uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
{
	const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
	uint32_t partitions = 1u << max_partition_order;

	FLAC__ASSERT(default_partition_samples > predictor_order);

	/* first do max_partition_order */
	{
		const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
		uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);

		if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				uint32_t e1, e3;
				end += default_partition_samples;

				e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
				if(e1 > end)
					e1 = end; /* try flac -l 1 -b 16 and you'll be here */
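				/* e1 is residual_sample rounded up to a multiple of 4 and e3 is end
				   rounded down: scalar head loop up to e1, 4-wide loop up to e3,
				   scalar tail up to end */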

				/* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				for( ; residual_sample < e3; residual_sample+=4) {
					__m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
				mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
				abs_residual_partition_sums[partition] &= 0xFFFFFFFF;
#endif
			}
		}
		else { /* have to pessimistically use 64 bits for accumulator */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				uint32_t e1, e3;
				end += default_partition_samples;

				e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
				FLAC__ASSERT(e1 <= end);

				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); /*  0   0   0  |r0|  ==   00   |r0_64| */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				for( ; residual_sample < e3; residual_sample+=2) {
					__m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /*  0   0  |r1|   |r0| */
					mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0  |r1|  0  |r0|  ==  |r1_64|  |r0_64|  */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
			}
		}
	}

	/* now merge partitions for lower orders */
	{
		uint32_t from_partition = 0, to_partition = partitions;
		int partition_order;
		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
			uint32_t i;
			partitions >>= 1;
			for(i = 0; i < partitions; i++) {
				abs_residual_partition_sums[to_partition++] =
					abs_residual_partition_sums[from_partition  ] +
					abs_residual_partition_sums[from_partition+1];
				from_partition += 2;
			}
		}
	}
}
Example no. 12
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);
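  // e.g. for in = -5 (0xfffb): sign = 0xffff (-1), in ^ sign = 4, 4 - (-1) = 5;
  // for in >= 0, sign = 0 and the value is unchanged.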

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
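  // The target order is the usual 4x4 zigzag scan:
  //   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15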
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
Example no. 13
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);

  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose.
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // a02 a12 a22 a32   a03 a13 a23 a33
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // 02 12 22 32   03 13 23 33
    // 00 10 20 30   01 11 21 31
    // 03 13 23 33   02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}