// function that implements the kernel of the seismic modeling algorithm
void seismic_exec(float **VEL, float **PPF, float **APF, float **NPF,
                  float *seismicPulseVector, int spPosX, int spPosY,
                  int xDim, int yDim, int timeSteps)
{
    int i, j;   // spatial loop counters
    int t;      // time loop counter
#ifdef _VERBOSE
    int progressTimer = -1;
#endif

    // make sure packing _all_ the data into sets of 4 elements is ok
    assert( xDim % 4 == 0 );

#ifdef _VERBOSE
    printf("processing...\n");
    printf("point of explosion = %d, %d\n", spPosX, spPosY);
#endif

    // there are 16 XMM registers in 64 bit mode, so there is no need to spill to stack
    __m128 s_ppf, s_vel, s_actual, s_above1, s_left1, s_under1, s_right1,
           s_two, s_sixteen, s_sixty;
    __m128 s_above2, s_under2, s_left2, s_right2;

    float two[4]     = { 2.0f,  2.0f,  2.0f,  2.0f};
    float sixteen[4] = {16.0f, 16.0f, 16.0f, 16.0f};
    float sixty[4]   = {60.f,  60.f,  60.f,  60.f};

    // preload XMM registers with constant values.
    s_two     = _mm_load_ps( two );
    s_sixteen = _mm_load_ps( sixteen );
    s_sixty   = _mm_load_ps( sixty );

    // time loop
    for (t = 0; t < timeSteps; t++)
    {
#ifdef _VVERBOSE
        printf("----------------------------------------------\ntimestep: %d\n\n", t );
#endif

        // add pulse
        APF[spPosY][spPosX] += seismicPulseVector[t];

        for(i=2; i<(yDim-2); i++)
        {
            for(j=2 + ALIGNMENT_OFFSET; j<(xDim-2); j+=4)
            {
                s_ppf    = _mm_load_ps( &(PPF[i][j]) );
                s_vel    = _mm_load_ps( &(VEL[i][j]) );
                s_actual = _mm_load_ps( &(APF[i][j]) );

                s_left1  = _mm_load_ps( &(APF[i-1][j]) );
                s_left2  = _mm_load_ps( &(APF[i-2][j]) );
                s_right2 = _mm_load_ps( &(APF[i+2][j]) );
                s_right1 = _mm_load_ps( &(APF[i+1][j]) );
                s_above1 = _mm_loadu_ps( &(APF[i][j-1]) );
                s_under1 = _mm_loadu_ps( &(APF[i][j+1]) );
                s_above2 = _mm_loadl_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(1, 0, 0, 0)),
                                         (__m64 const*)&(APF[i][j-2]) );
                s_under2 = _mm_loadh_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(0, 0, 3, 2)),
                                         (__m64 const*)&(APF[i][j+4]) );

                // sum elements with an offset of one
                s_under1 = _mm_add_ps( s_under1, _mm_add_ps( s_above1, _mm_add_ps( s_left1, s_right1)));
                // sum elements with an offset of two
                s_above2 = _mm_add_ps( s_left2, _mm_add_ps( s_right2, _mm_add_ps( s_under2, s_above2)));
                // multiply with 16
                s_under1 = _mm_mul_ps( s_sixteen, s_under1 );
                // subtract the offset-two sum and 60 times the center element
                s_under1 = _mm_sub_ps( _mm_sub_ps( s_under1, s_above2), _mm_mul_ps( s_sixty, s_actual ) );
                s_under1 = _mm_add_ps( _mm_mul_ps( s_vel, s_under1), _mm_sub_ps(_mm_mul_ps( s_two, s_actual ), s_ppf) );

                // save the result
                _mm_store_ps( &(NPF[i][j]), s_under1);
#ifdef _VVERBOSE
                printf("[%d][%d]\n", i, j);
#endif
            }
#ifdef _VVERBOSE
            printf("\n");
#endif
        }

#ifdef _VERBOSE
        // shows one # at each 10% of the total processing time
        if (t/(timeSteps/10) > progressTimer )
        {
            printf("#");
            progressTimer++;
            fflush(stdout);
        }
#endif

        // rotate pointers instead of copying data (a temporary is needed,
        // otherwise PPF and NPF would end up aliasing the same buffer)
        {
            float **tmp = PPF;
            PPF = APF;
            APF = NPF;
            NPF = tmp;
        }
    }

#ifdef _VERBOSE
    printf("\nend process!\n");
#endif
}
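/* For reference: a minimal scalar sketch (not from the original source) of the
   update the SSE kernel above vectorizes. The coefficients 16, -1 and -60 form
   the classic fourth-order Laplacian stencil; names match the function above. */
static void seismic_step_scalar(float **VEL, float **PPF, float **APF,
                                float **NPF, int i, int j)
{
    float lap = 16.0f * (APF[i][j-1] + APF[i][j+1] + APF[i-1][j] + APF[i+1][j])
              -         (APF[i][j-2] + APF[i][j+2] + APF[i-2][j] + APF[i+2][j])
              -  60.0f * APF[i][j];
    // leapfrog time integration: next = velocity * laplacian + 2*current - previous
    NPF[i][j] = VEL[i][j] * lap + 2.0f * APF[i][j] - PPF[i][j];
}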
int main()
{
#ifndef __EMSCRIPTEN__
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
#endif

    printf("{ \"workload\": %u, \"results\": [\n", N);
    assert(N%2 == 0); // Don't care about the tail for now.
    double *src = get_src_d();//(float*)aligned_alloc(16, N*sizeof(float));
    for(int i = 0; i < N; ++i)
        src[i] = (double)rand() / RAND_MAX;
    double *src2 = get_src2_d();//(float*)aligned_alloc(16, N*sizeof(float));
    for(int i = 0; i < N; ++i)
        src2[i] = (double)rand() / RAND_MAX;
    double *dst = get_dst_d();//(float*)aligned_alloc(16, N*sizeof(float));
    float scalarTime;

    SETCHART("load");
    START();
    for(int i = 0; i < N; ++i)
        dst[i] = src[i];
    ENDSCALAR(checksum_dst(dst), "scalar");

    LS_TEST("_mm_load_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
    LS_TEST("_mm_load_pd1", _mm_load_pd1, 1, _mm_store_pd, double*, 0, 2);
    LS_TEST("_mm_load_sd", _mm_load_sd, 1, _mm_store_pd, double*, 0, 2);
    // _mm_load_si128
    LS_TEST("_mm_load1_pd", _mm_load1_pd, 1, _mm_store_pd, double*, 0, 2);
    __m128d tempReg = _mm_set_pd(1.0, 2.0);
    LSH_TEST("_mm_loadh_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2);
    // _mm_loadl_epi64
    LSH_TEST("_mm_loadl_pd", tempReg, _mm_loadl_pd, double*, 1, _mm_store_pd, double*, 0, 2);
    LS_TEST("_mm_loadr_pd", _mm_loadr_pd, 0, _mm_store_pd, double*, 0, 2);
    LS_TEST("_mm_loadu_pd", _mm_loadu_pd, 1, _mm_store_pd, double*, 0, 2);
    // _mm_loadu_si128

    SETCHART("set");
    /* _mm_set_epi16 _mm_set_epi32 _mm_set_epi64 _mm_set_epi64x _mm_set_epi8 */
    SS_TEST_D("_mm_set_pd", _mm_set_pd(src[i+2], src[i+0]));
    //SS_TEST_D("_mm_set_pd1", _mm_set_pd1(src[i]));
    SS_TEST_D("_mm_set_sd", _mm_set_sd(src[i]));
    /* _mm_set1_epi16 _mm_set1_epi32 _mm_set1_epi64 _mm_set1_epi64x _mm_set1_epi8 */
    SS_TEST_D("_mm_set1_pd", _mm_set1_pd(src[i]));
    /* _mm_setr_epi16 _mm_setr_epi32 _mm_setr_epi64 _mm_setr_epi8 */
    SS_TEST_D("_mm_setr_pd", _mm_setr_pd(src[i+0], src[i+2]));
    SS_TEST_D("_mm_setzero_pd", _mm_setzero_pd());
    // _mm_setzero_si128

    SETCHART("move");
    // _mm_move_epi64
    SS_TEST_D("_mm_move_sd", _mm_move_sd(_mm_load_pd(src+i), _mm_load_pd(src2+i)));

    SETCHART("store");
    // _mm_maskmoveu_si128
    LS_TEST("_mm_store_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2);
    // LS_TEST("_mm_store_pd1", _mm_load_pd, 0, _mm_store_pd1, double*, 0);
    LS_TEST("_mm_store_sd", _mm_load_pd, 0, _mm_store_sd, double*, 1, 2);
    // _mm_store_si128
    // _mm_store1_pd
    LS64_TEST("_mm_storeh_pi", _mm_load_pd, 0, _mm_storeh_pi, 1, 2);
    // _mm_storel_epi64
    LS64_TEST("_mm_storel_pi", _mm_load_pd, 0, _mm_storel_pi, 1, 2);
    LS_TEST("_mm_storer_pd", _mm_load_pd, 0, _mm_storer_pd, double*, 0, 2);
    LS_TEST("_mm_storeu_pd", _mm_load_pd, 0, _mm_storeu_pd, double*, 1, 2);
    // _mm_storeu_si128
    LS_TEST("_mm_stream_pd", _mm_load_pd, 0, _mm_stream_pd, double*, 0, 2);
    // _mm_stream_si128
    // _mm_stream_si32
    // _mm_stream_si64

    SETCHART("arithmetic");
    // _mm_add_epi16 // _mm_add_epi32 // _mm_add_epi64 // _mm_add_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3];
    }
    ENDSCALAR(checksum_dst(dst), "scalar add");
    BINARYOP_TEST_D("_mm_add_pd", _mm_add_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_add_sd", _mm_add_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_adds_epi16 // _mm_adds_epi8 // _mm_adds_epu16 // _mm_adds_epu8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3];
    }
    ENDSCALAR(checksum_dst(dst), "scalar div");
    BINARYOP_TEST_D("_mm_div_pd", _mm_div_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_div_sd", _mm_div_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_madd_epi16 // _mm_mul_epu32
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3];
    }
    ENDSCALAR(checksum_dst(dst), "scalar mul");
    BINARYOP_TEST_D("_mm_mul_pd", _mm_mul_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_mul_sd", _mm_mul_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_mulhi_epi16 // _mm_mulhi_epu16 // _mm_mullo_epi16 // _mm_sad_epu8
    // _mm_sub_epi16 // _mm_sub_epi32 // _mm_sub_epi64 // _mm_sub_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3];
    }
    ENDSCALAR(checksum_dst(dst), "scalar sub");
    BINARYOP_TEST_D("_mm_sub_pd", _mm_sub_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_sub_sd", _mm_sub_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_subs_epi16 // _mm_subs_epi8 // _mm_subs_epu16 // _mm_subs_epu8

    SETCHART("roots");
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = sqrt(dst[0]); dst[1] = sqrt(dst[1]); dst[2] = sqrt(dst[2]); dst[3] = sqrt(dst[3]);
    }
    ENDSCALAR(checksum_dst(dst), "scalar sqrt");
    UNARYOP_TEST_D("_mm_sqrt_pd", _mm_sqrt_pd, _mm_load_pd(src));
    // UNARYOP_TEST_D("_mm_sqrt_sd", _mm_sqrt_sd, _mm_load_pd(src));

    SETCHART("logical");
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = ucastd(dcastu(dst[0]) & dcastu(src2[0]));
        dst[1] = ucastd(dcastu(dst[1]) & dcastu(src2[1]));
        dst[2] = ucastd(dcastu(dst[2]) & dcastu(src2[2]));
        dst[3] = ucastd(dcastu(dst[3]) & dcastu(src2[3]));
    }
    ENDSCALAR(checksum_dst(dst), "scalar and");
    BINARYOP_TEST_D("_mm_and_pd", _mm_and_pd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_and_si128
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = ucastd((~dcastu(dst[0])) & dcastu(src2[0]));
        dst[1] = ucastd((~dcastu(dst[1])) & dcastu(src2[1]));
        dst[2] = ucastd((~dcastu(dst[2])) & dcastu(src2[2]));
        dst[3] = ucastd((~dcastu(dst[3])) & dcastu(src2[3]));
    }
    ENDSCALAR(checksum_dst(dst), "scalar andnot");
    BINARYOP_TEST_D("_mm_andnot_pd", _mm_andnot_pd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_andnot_si128
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = ucastd(dcastu(dst[0]) | dcastu(src2[0]));
        dst[1] = ucastd(dcastu(dst[1]) | dcastu(src2[1]));
        dst[2] = ucastd(dcastu(dst[2]) | dcastu(src2[2]));
        dst[3] = ucastd(dcastu(dst[3]) | dcastu(src2[3]));
    }
    ENDSCALAR(checksum_dst(dst), "scalar or");
    BINARYOP_TEST_D("_mm_or_pd", _mm_or_pd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_or_si128
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = ucastd(dcastu(dst[0]) ^ dcastu(src2[0]));
        dst[1] = ucastd(dcastu(dst[1]) ^ dcastu(src2[1]));
        dst[2] = ucastd(dcastu(dst[2]) ^ dcastu(src2[2]));
        dst[3] = ucastd(dcastu(dst[3]) ^ dcastu(src2[3]));
    }
    ENDSCALAR(checksum_dst(dst), "scalar xor");
    BINARYOP_TEST_D("_mm_xor_pd", _mm_xor_pd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_xor_si128

    SETCHART("cmp");
    // _mm_cmpeq_epi16 // _mm_cmpeq_epi32 // _mm_cmpeq_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (dst[0] == src2[0]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (dst[1] == src2[1]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (dst[2] == src2[2]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (dst[3] == src2[3]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmp==");
    BINARYOP_TEST_D("_mm_cmpeq_pd", _mm_cmpeq_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmpeq_sd", _mm_cmpeq_sd, _mm_load_pd(src), _mm_load_pd(src2));
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (dst[0] >= src2[0]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (dst[1] >= src2[1]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (dst[2] >= src2[2]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (dst[3] >= src2[3]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmp>=");
    BINARYOP_TEST_D("_mm_cmpge_pd", _mm_cmpge_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmpge_sd", _mm_cmpge_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_cmpgt_epi16 // _mm_cmpgt_epi32 // _mm_cmpgt_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (dst[0] > src2[0]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (dst[1] > src2[1]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (dst[2] > src2[2]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (dst[3] > src2[3]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmp>");
    BINARYOP_TEST_D("_mm_cmpgt_pd", _mm_cmpgt_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmpgt_sd", _mm_cmpgt_sd, _mm_load_pd(src), _mm_load_pd(src2));
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (dst[0] <= src2[0]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (dst[1] <= src2[1]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (dst[2] <= src2[2]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (dst[3] <= src2[3]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmp<=");
    BINARYOP_TEST_D("_mm_cmple_pd", _mm_cmple_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmple_sd", _mm_cmple_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_cmplt_epi16 // _mm_cmplt_epi32 // _mm_cmplt_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (dst[0] < src2[0]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (dst[1] < src2[1]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (dst[2] < src2[2]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (dst[3] < src2[3]) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmp<");
    BINARYOP_TEST_D("_mm_cmplt_pd", _mm_cmplt_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmplt_sd", _mm_cmplt_sd, _mm_load_pd(src), _mm_load_pd(src2));
    /* _mm_cmpneq_pd _mm_cmpneq_sd _mm_cmpnge_pd _mm_cmpnge_sd _mm_cmpngt_pd
       _mm_cmpngt_sd _mm_cmpnle_pd _mm_cmpnle_sd _mm_cmpnlt_pd _mm_cmpnlt_sd */
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmpord");
    BINARYOP_TEST_D("_mm_cmpord_pd", _mm_cmpord_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmpord_sd", _mm_cmpord_sd, _mm_load_pd(src), _mm_load_pd(src2));
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
        dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? ucastd(0xFFFFFFFFFFFFFFFFULL) : 0.0;
    }
    ENDSCALAR(checksum_dst(dst), "scalar cmpunord");
    BINARYOP_TEST_D("_mm_cmpunord_pd", _mm_cmpunord_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_cmpunord_sd", _mm_cmpunord_sd, _mm_load_pd(src), _mm_load_pd(src2));

    SETCHART("max");
    // _mm_max_epi16 // _mm_max_epu8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]);
        dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]);
    }
    ENDSCALAR(checksum_dst(dst), "scalar max");
    BINARYOP_TEST_D("_mm_max_pd", _mm_max_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_max_sd", _mm_max_sd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_min_epi16 // _mm_min_epu8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]);
        dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]);
    }
    ENDSCALAR(checksum_dst(dst), "scalar min");
    BINARYOP_TEST_D("_mm_min_pd", _mm_min_pd, _mm_load_pd(src), _mm_load_pd(src2));
    BINARYOP_TEST_D("_mm_min_sd", _mm_min_sd, _mm_load_pd(src), _mm_load_pd(src2));

    SETCHART("shuffle");
    // _mm_extract_epi16 // _mm_insert_epi16 // _mm_shuffle_epi32
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2];
    }
    ENDSCALAR(checksum_dst(dst), "scalar shuffle");
    // BINARYOP_TEST_D("_mm_shuffle_pd", _mm_shuffle_pd, _mm_load_pd(src), _mm_load_pd(src2));
    START();
    __m128d o0 = _mm_load_pd(src);
    __m128d o1 = _mm_load_pd(src2);
    for(int i = 0; i < N; i += 4)
        o0 = _mm_shuffle_pd(o0, o1, _MM_SHUFFLE2(1, 0));
    _mm_store_pd(dst, o0);
    END(checksum_dst(dst), "_mm_shuffle_pd");
    // _mm_shufflehi_epi16 // _mm_shufflelo_epi16
    // _mm_unpackhi_epi16 // _mm_unpackhi_epi32 // _mm_unpackhi_epi64 // _mm_unpackhi_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3];
    }
    ENDSCALAR(checksum_dst(dst), "scalar unpackhi_pd");
    BINARYOP_TEST_D("_mm_unpackhi_pd", _mm_unpackhi_pd, _mm_load_pd(src), _mm_load_pd(src2));
    // _mm_unpacklo_epi16 // _mm_unpacklo_epi32 // _mm_unpacklo_epi64 // _mm_unpacklo_epi8
    START();
    dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
    for(int i = 0; i < N; ++i)
    {
        dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1];
    }
    ENDSCALAR(checksum_dst(dst), "scalar unpacklo_pd");
    BINARYOP_TEST_D("_mm_unpacklo_pd", _mm_unpacklo_pd, _mm_load_pd(src), _mm_load_pd(src2));

    printf("]}\n");

    /*
    printf("Finished!\n");
    printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec());
    printf("Total time spent in SSE2 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec());
    if (scalarTotalTicks > simdTotalTicks)
        printf("SSE2 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks);
    else
        printf("SSE2 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks);
    */

#ifdef __EMSCRIPTEN__
    fprintf(stderr, "User Agent: %s\n", emscripten_run_script_string("navigator.userAgent"));
    printf("/*Test finished! Now please close Firefox to continue with benchmark_sse2.py.*/\n");
#endif
    exit(0);
}
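/* The LS_TEST/SS_TEST_D/BINARYOP_TEST_D macros used above are defined
   elsewhere in the benchmark harness, and their exact expansion is not shown
   in this file. As a hedged illustration only, a binary-op timing kernel has
   roughly this shape: a long dependency chain on the tested intrinsic, with a
   final store so the result stays observable (assumes <emmintrin.h> from this
   file; the function name is invented for this sketch): */
static void bench_add_pd_sketch(const double *src, const double *src2,
                                double *dst, int n)
{
    __m128d a = _mm_load_pd(src);
    __m128d b = _mm_load_pd(src2);
    for (int i = 0; i < n; i += 2)
        a = _mm_add_pd(a, b);   /* chaining defeats dead-code elimination */
    _mm_store_pd(dst, a);       /* observable side effect for the checksum */
}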
OD_SIMD_INLINE int32_t od_mc_compute_satd8_8x8_part(const unsigned char *src,
 int systride, const unsigned char *ref, int rystride) {
  int32_t satd;
  __m128i sums;
  __m128i a;
  __m128i b;
  __m128i c;
  __m128i d;
  __m128i e;
  __m128i f;
  __m128i g;
  __m128i h;
  a = od_load_convert_subtract_x8(src + 0*systride, ref + 0*rystride);
  b = od_load_convert_subtract_x8(src + 1*systride, ref + 1*rystride);
  c = od_load_convert_subtract_x8(src + 2*systride, ref + 2*rystride);
  d = od_load_convert_subtract_x8(src + 3*systride, ref + 3*rystride);
  e = od_load_convert_subtract_x8(src + 4*systride, ref + 4*rystride);
  f = od_load_convert_subtract_x8(src + 5*systride, ref + 5*rystride);
  g = od_load_convert_subtract_x8(src + 6*systride, ref + 6*rystride);
  h = od_load_convert_subtract_x8(src + 7*systride, ref + 7*rystride);
  /*Vertical 1D transform.*/
  od_mc_butterfly_2x2_16x8(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x8(&e, &f, &g, &h);
  od_mc_butterfly_2x2_16x8(&a, &b, &e, &f);
  od_mc_butterfly_2x2_16x8(&c, &d, &g, &h);
  od_mc_butterfly_2x2_16x8(&a, &b, &e, &f);
  od_mc_butterfly_2x2_16x8(&c, &d, &g, &h);
  od_transpose16x8(&a, &c, &b, &d, &e, &g, &f, &h);
  /*Horizontal 1D transform.*/
  od_mc_butterfly_2x2_16x8(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x8(&e, &f, &g, &h);
  od_mc_butterfly_2x2_16x8(&a, &b, &e, &f);
  od_mc_butterfly_2x2_16x8(&c, &d, &g, &h);
  /*Use the fact that (abs(a + b) + abs(a - b))/2 = max(abs(a), abs(b)) to
     merge the final butterfly stage with the calculation of the absolute
     values and the first stage of accumulation.
    Calculates (abs(a + b) + abs(a - b))/2 - 0x7FFF.
    An offset must be added to the final sum before rounding to account for
     subtracting 0x7FFF.*/
  a = _mm_sub_epi16(_mm_max_epi16(a, b),
   _mm_adds_epi16(_mm_add_epi16(a, b), _mm_set1_epi16(0x7FFF)));
  e = _mm_sub_epi16(_mm_max_epi16(e, f),
   _mm_adds_epi16(_mm_add_epi16(e, f), _mm_set1_epi16(0x7FFF)));
  c = _mm_sub_epi16(_mm_max_epi16(c, d),
   _mm_adds_epi16(_mm_add_epi16(c, d), _mm_set1_epi16(0x7FFF)));
  g = _mm_sub_epi16(_mm_max_epi16(g, h),
   _mm_adds_epi16(_mm_add_epi16(g, h), _mm_set1_epi16(0x7FFF)));
  a = _mm_add_epi16(a, e);
  c = _mm_add_epi16(c, g);
  /*Convert to 32-bit unsigned integers and sum horizontally using madd to
     avoid overflowing 16-bit unsigned integers.
    The naively calculated max values of a and c are
     ((8 rows * 8 cols * 256) * 2 / 2 + 1 (offset)) * 2 or 0x8002.
    The actual max is lower so it is safe to use _mm_madd_epi16.*/
  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
  c = _mm_madd_epi16(c, _mm_set1_epi16(1));
  sums = _mm_add_epi32(a, c);
  /*Sum the elements of the vector.*/
  sums = _mm_add_epi32(sums, _mm_shuffle_epi32(sums, _MM_SHUFFLE(0, 1, 2, 3)));
  sums = _mm_add_epi32(sums, _mm_shuffle_epi32(sums, _MM_SHUFFLE(2, 3, 0, 1)));
  satd = _mm_cvtsi128_si32(sums);
  /*Subtract the offset (32) and round.*/
  satd = (satd + 2 - 32) >> 2;
#if defined(OD_CHECKASM)
  {
    int32_t c_satd;
    c_satd = od_mc_compute_satd8_8x8_c(src, systride, ref, rystride);
    if (satd != c_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
       8, 8, satd, c_satd);
    }
  }
#endif
  return satd;
}
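/*A scalar restatement (illustration, not from the original source) of the
   identity the merged stage above relies on:
   (abs(a + b) + abs(a - b))/2 == max(abs(a), abs(b)).
  The sum is always even, so the division is exact.*/
static int satd_merge_scalar(int a, int b) {
  int sum = a + b >= 0 ? a + b : -(a + b);
  int diff = a - b >= 0 ? a - b : -(a - b);
  /*Equals the larger of abs(a) and abs(b).*/
  return (sum + diff)/2;
}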
int32max );
  MEMALIGN(16, __m128i cur_seed_split);
  MEMALIGN(16, __m128i multiplier);
  MEMALIGN(16, __m128i adder);
  MEMALIGN(16, __m128i mod_mask);
  MEMALIGN(16, __m128 res);
  MEMALIGN(16, static const unsigned int mult[4]) = {214013, 17405, 214013, 69069};
  MEMALIGN(16, static const unsigned int gadd[4]) = {2531011, 10395331, 13737667, 1};
  MEMALIGN(16, static const unsigned int mask[4]) = {0xFFFFFFFF, 0, 0xFFFFFFFF, 0};

  adder          = _mm_load_si128((__m128i*)gadd);
  multiplier     = _mm_load_si128((__m128i*)mult);
  mod_mask       = _mm_load_si128((__m128i*)mask);
  cur_seed_split = _mm_shuffle_epi32(m_sseSeed, _MM_SHUFFLE(2, 3, 0, 1));

  m_sseSeed      = _mm_mul_epu32(m_sseSeed, multiplier);
  multiplier     = _mm_shuffle_epi32(multiplier, _MM_SHUFFLE(2, 3, 0, 1));
  cur_seed_split = _mm_mul_epu32(cur_seed_split, multiplier);

  m_sseSeed      = _mm_and_si128(m_sseSeed, mod_mask);
  cur_seed_split = _mm_and_si128(cur_seed_split, mod_mask);
  cur_seed_split = _mm_shuffle_epi32(cur_seed_split, _MM_SHUFFLE(2, 3, 0, 1));
  m_sseSeed      = _mm_or_si128(m_sseSeed, cur_seed_split);
  m_sseSeed      = _mm_add_epi32(m_sseSeed, adder);

  /* adjust the value to the range requested */
  res = _mm_cvtepi32_ps(m_sseSeed);
  if (sseresult)
    *sseresult = _mm_mul_ps(res, f);
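/* A scalar view (illustrative, not from the original source) of the generator
   above: each 32-bit lane is an independent linear congruential generator
   stepped as seed * mult + add, with mult/add taken per lane from the mult[]
   and gadd[] tables. The _mm_mul_epu32/shuffle/mask dance above exists only
   because SSE2 lacks a packed 32x32 -> low-32 multiply; the net effect per
   lane is simply: */
static unsigned int lcg_step(unsigned int seed, unsigned int mult, unsigned int add)
{
  return seed * mult + add; /* modulo 2^32 via unsigned wrap-around */
}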
void GSSetupPrimCodeGenerator::Depth()
{
    if(!m_en.z && !m_en.f)
    {
        return;
    }

    if(!m_sel.sprite)
    {
        // GSVector4 t = dscan.p;

        movaps(xmm0, xmmword[edx + 16]);

        if(m_en.f)
        {
            // GSVector4 df = p.wwww();

            movaps(xmm1, xmm0);
            shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

            // m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();

            movaps(xmm2, xmm1);
            mulps(xmm2, xmm3);
            cvttps2dq(xmm2, xmm2);
            pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
            pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
            movdqa(xmmword[&m_env.d4.f], xmm2);

            for(int i = 0; i < 4; i++)
            {
                // m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();

                movaps(xmm2, xmm1);
                mulps(xmm2, Xmm(4 + i));
                cvttps2dq(xmm2, xmm2);
                pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
                pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
                movdqa(xmmword[&m_env.d[i].f], xmm2);
            }
        }

        if(m_en.z)
        {
            // GSVector4 dz = p.zzzz();

            shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

            // m_env.d4.z = dz * 4.0f;

            movaps(xmm1, xmm0);
            mulps(xmm1, xmm3);
            movdqa(xmmword[&m_env.d4.z], xmm1);

            for(int i = 0; i < 4; i++)
            {
                // m_env.d[i].z = dz * m_shift[i];

                movaps(xmm1, xmm0);
                mulps(xmm1, Xmm(4 + i));
                movdqa(xmmword[&m_env.d[i].z], xmm1);
            }
        }
    }
    else
    {
        // GSVector4 p = vertices[0].p;

        movaps(xmm0, xmmword[ecx + 16]);

        if(m_en.f)
        {
            // m_env.p.f = GSVector4i(p).zzzzh().zzzz();

            movaps(xmm1, xmm0);
            cvttps2dq(xmm1, xmm1);
            pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
            pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
            movdqa(xmmword[&m_env.p.f], xmm1);
        }

        if(m_en.z)
        {
            // GSVector4 z = p.zzzz();

            shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));

            if(m_sel.zoverflow)
            {
                // m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());

                static const float half = 0.5f;

                movss(xmm1, dword[&half]);
                shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
                mulps(xmm1, xmm0);
                cvttps2dq(xmm1, xmm1);
                pslld(xmm1, 1);
                cvttps2dq(xmm0, xmm0);
                pcmpeqd(xmm2, xmm2);
                psrld(xmm2, 31);
                pand(xmm0, xmm2);
                por(xmm0, xmm1);
            }
            else
            {
                // m_env.p.z = GSVector4i(z);

                cvttps2dq(xmm0, xmm0);
            }

            movdqa(xmmword[&m_env.p.z], xmm0);
        }
    }
}
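// A hedged sketch (names invented for illustration, not PCSX2 API) of the data
// the emitted code above fills in: per-primitive step deltas so the rasterizer
// can advance a gradient by 4 pixels at a time (d4) or by a sub-step within a
// quad (d[i]), exactly as the m_env.d4.z / m_env.d[i].z comments describe.
static void setup_gradient_steps(float dz, float d4_out[4], float d_out[4][4],
                                 const float shift[4][4])
{
    for(int l = 0; l < 4; l++)
    {
        d4_out[l] = dz * 4.0f; // m_env.d4.z = dz * 4.0f
    }

    for(int i = 0; i < 4; i++)
    {
        for(int l = 0; l < 4; l++)
        {
            d_out[i][l] = dz * shift[i][l]; // m_env.d[i].z = dz * m_shift[i]
        }
    }
}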
gboolean
gimp_operation_normal_process_sse2 (GeglOperation       *op,
                                    void                *in_p,
                                    void                *layer_p,
                                    void                *mask_p,
                                    void                *out_p,
                                    glong                samples,
                                    const GeglRectangle *roi,
                                    gint                 level)
{
  /* check alignment */
  if ((((uintptr_t) in_p) | ((uintptr_t) layer_p) | ((uintptr_t) out_p)) & 0x0F)
    {
      return gimp_operation_normal_process (op, in_p, layer_p, mask_p, out_p,
                                            samples, roi, level);
    }
  else
    {
      GimpOperationLayerMode *layer_mode = (gpointer) op;
      gfloat                  opacity    = layer_mode->opacity;
      gfloat                 *mask       = mask_p;
      const __v4sf           *v_in       = (const __v4sf *) in_p;
      const __v4sf           *v_layer    = (const __v4sf *) layer_p;
      __v4sf                 *v_out      = (__v4sf *) out_p;

      const __v4sf one       = _mm_set1_ps (1.0f);
      const __v4sf v_opacity = _mm_set1_ps (opacity);

      switch (layer_mode->real_composite_mode)
        {
        case GIMP_LAYER_COMPOSITE_UNION:
        case GIMP_LAYER_COMPOSITE_AUTO:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_layer,
                                                  _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  __v4sf dst_alpha, a_term, out_pixel, out_alpha, out_pixel_rbaa;

                  /* expand alpha */
                  dst_alpha = (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_in,
                                                          _MM_SHUFFLE (3, 3, 3, 3));

                  /* a_term = dst_a * (1.0 - src_a) */
                  a_term = dst_alpha * (one - alpha);

                  /* out(color) = src * src_a + dst * a_term */
                  out_pixel = rgba_layer * alpha + rgba_in * a_term;

                  /* out(alpha) = 1.0 * src_a + 1.0 * a_term */
                  out_alpha = alpha + a_term;

                  /* un-premultiply */
                  out_pixel = out_pixel / out_alpha;

                  /* swap in the real alpha */
                  out_pixel_rbaa = _mm_shuffle_ps (out_pixel, out_alpha,
                                                   _MM_SHUFFLE (3, 3, 2, 0));
                  out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa,
                                              _MM_SHUFFLE (2, 1, 1, 0));

                  *v_out++ = out_pixel;
                }
              else
                {
                  *v_out++ = rgba_in;
                }
            }
          break;

        case GIMP_LAYER_COMPOSITE_CLIP_TO_BACKDROP:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_layer,
                                                  _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  __v4sf dst_alpha, out_pixel, out_pixel_rbaa;

                  /* expand alpha */
                  dst_alpha = (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_in,
                                                          _MM_SHUFFLE (3, 3, 3, 3));

                  /* out(color) = dst * (1 - src_a) + src * src_a */
                  out_pixel = rgba_in + (rgba_layer - rgba_in) * alpha;

                  /* swap in the real alpha */
                  out_pixel_rbaa = _mm_shuffle_ps (out_pixel, dst_alpha,
                                                   _MM_SHUFFLE (3, 3, 2, 0));
                  out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa,
                                              _MM_SHUFFLE (2, 1, 1, 0));

                  *v_out++ = out_pixel;
                }
              else
                {
                  *v_out++ = rgba_in;
                }
            }
          break;

        case GIMP_LAYER_COMPOSITE_CLIP_TO_LAYER:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;
              __v4sf out_pixel, out_pixel_rbaa;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_layer,
                                                  _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  /* out(color) = src */
                  out_pixel = rgba_layer;
                }
              else
                {
                  out_pixel = rgba_in;
                }

              /* swap in the real alpha */
              out_pixel_rbaa = _mm_shuffle_ps (out_pixel, alpha,
                                               _MM_SHUFFLE (3, 3, 2, 0));
              out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa,
                                          _MM_SHUFFLE (2, 1, 1, 0));

              *v_out++ = out_pixel;
            }
          break;

        case GIMP_LAYER_COMPOSITE_INTERSECTION:
          while (samples--)
            {
              __v4sf rgba_in, rgba_layer, alpha;
              __v4sf out_pixel, out_pixel_rbaa;

              rgba_in    = *v_in++;
              rgba_layer = *v_layer++;

              /* expand alpha */
              alpha = (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_layer,
                                                  _MM_SHUFFLE (3, 3, 3, 3));

              if (mask)
                {
                  __v4sf mask_alpha;

                  /* multiply layer's alpha by the mask */
                  mask_alpha = _mm_set1_ps (*mask++);
                  alpha = alpha * mask_alpha;
                }

              alpha = alpha * v_opacity;

              /* multiply the alpha by in's alpha */
              alpha *= (__v4sf) _mm_shuffle_epi32 ((__m128i) rgba_in,
                                                   _MM_SHUFFLE (3, 3, 3, 3));

              if (_mm_ucomigt_ss (alpha, _mm_setzero_ps ()))
                {
                  /* out(color) = src */
                  out_pixel = rgba_layer;
                }
              else
                {
                  out_pixel = rgba_in;
                }

              /* swap in the real alpha */
              out_pixel_rbaa = _mm_shuffle_ps (out_pixel, alpha,
                                               _MM_SHUFFLE (3, 3, 2, 0));
              out_pixel = _mm_shuffle_ps (out_pixel, out_pixel_rbaa,
                                          _MM_SHUFFLE (2, 1, 1, 0));

              *v_out++ = out_pixel;
            }
          break;
        }
    }

  return TRUE;
}
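/* A scalar equivalent (illustrative sketch, mask handling omitted) of the
 * UNION/AUTO inner loop above, per float-RGBA pixel: a standard "over"
 * composite with un-premultiplied output.
 */
static void
normal_union_scalar (const float in[4], const float layer[4],
                     float opacity, float out[4])
{
  float src_a  = layer[3] * opacity;
  float a_term = in[3] * (1.0f - src_a);      /* dst_a * (1 - src_a) */
  float out_a  = src_a + a_term;
  gint  c;

  if (src_a > 0.0f)
    {
      for (c = 0; c < 3; c++)
        out[c] = (layer[c] * src_a + in[c] * a_term) / out_a;

      out[3] = out_a;
    }
  else
    {
      for (c = 0; c < 4; c++)
        out[c] = in[c];
    }
}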
void GSSetupPrimCodeGenerator::Texture()
{
    if(!m_en.t)
    {
        return;
    }

    // GSVector4 t = dscan.t;

    movaps(xmm0, xmmword[edx + 32]);
    movaps(xmm1, xmm0);
    mulps(xmm1, xmm3);

    if(m_sel.fst)
    {
        // m_env.d4.st = GSVector4i(t * 4.0f);

        cvttps2dq(xmm1, xmm1);
        movdqa(xmmword[&m_env.d4.st], xmm1);
    }
    else
    {
        // m_env.d4.stq = t * 4.0f;

        movaps(xmmword[&m_env.d4.stq], xmm1);
    }

    for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
    {
        // GSVector4 ds = t.xxxx();
        // GSVector4 dt = t.yyyy();
        // GSVector4 dq = t.zzzz();

        movaps(xmm1, xmm0);
        shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));

        for(int i = 0; i < 4; i++)
        {
            // GSVector4 v = ds/dt * m_shift[i];

            movaps(xmm2, xmm1);
            mulps(xmm2, Xmm(4 + i));

            if(m_sel.fst)
            {
                // m_env.d[i].si/ti = GSVector4i(v);

                cvttps2dq(xmm2, xmm2);

                switch(j)
                {
                case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break;
                case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break;
                }
            }
            else
            {
                // m_env.d[i].s/t/q = v;

                switch(j)
                {
                case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break;
                case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break;
                case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break;
                }
            }
        }
    }
}
void GSSetupPrimCodeGenerator::Color()
{
    if(!m_en.c)
    {
        return;
    }

    if(m_sel.iip)
    {
        // GSVector4 c = dscan.c;

        movaps(xmm0, xmmword[edx]);
        movaps(xmm1, xmm0);

        // m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();

        movaps(xmm2, xmm0);
        mulps(xmm2, xmm3);
        cvttps2dq(xmm2, xmm2);
        pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
        packssdw(xmm2, xmm2);
        movdqa(xmmword[&m_env.d4.c], xmm2);

        // xmm3 is not needed anymore

        // GSVector4 dr = c.xxxx();
        // GSVector4 db = c.zzzz();

        shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
        shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));

        for(int i = 0; i < 4; i++)
        {
            // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();

            movaps(xmm2, xmm0);
            mulps(xmm2, Xmm(4 + i));
            cvttps2dq(xmm2, xmm2);
            packssdw(xmm2, xmm2);

            // GSVector4i b = GSVector4i(db * m_shift[i]).ps32();

            movaps(xmm3, xmm1);
            mulps(xmm3, Xmm(4 + i));
            cvttps2dq(xmm3, xmm3);
            packssdw(xmm3, xmm3);

            // m_env.d[i].rb = r.upl16(b);

            punpcklwd(xmm2, xmm3);
            movdqa(xmmword[&m_env.d[i].rb], xmm2);
        }

        // GSVector4 c = dscan.c;

        movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
        movaps(xmm1, xmm0);

        // GSVector4 dg = c.yyyy();
        // GSVector4 da = c.wwww();

        shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
        shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));

        for(int i = 0; i < 4; i++)
        {
            // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

            movaps(xmm2, xmm0);
            mulps(xmm2, Xmm(4 + i));
            cvttps2dq(xmm2, xmm2);
            packssdw(xmm2, xmm2);

            // GSVector4i a = GSVector4i(da * m_shift[i]).ps32();

            movaps(xmm3, xmm1);
            mulps(xmm3, Xmm(4 + i));
            cvttps2dq(xmm3, xmm3);
            packssdw(xmm3, xmm3);

            // m_env.d[i].ga = g.upl16(a);

            punpcklwd(xmm2, xmm3);
            movdqa(xmmword[&m_env.d[i].ga], xmm2);
        }
    }
    else
    {
        // GSVector4i c = GSVector4i(vertices[0].c);

        movaps(xmm0, xmmword[ecx]);
        cvttps2dq(xmm0, xmm0);

        // c = c.upl16(c.zwxy());

        movdqa(xmm1, xmm0);
        pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
        punpcklwd(xmm0, xmm1);

        // if(!tme) c = c.srl16(7);

        if(m_sel.tfx == TFX_NONE)
        {
            psrlw(xmm0, 7);
        }

        // m_env.c.rb = c.xxxx();
        // m_env.c.ga = c.zzzz();

        movdqa(xmm1, xmm0);
        pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
        pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
        movdqa(xmmword[&m_env.c.rb], xmm0);
        movdqa(xmmword[&m_env.c.ga], xmm1);
    }
}
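// Illustrative scalar view (not part of the original file) of the r.upl16(b)
// packing emitted above: punpcklwd interleaves the low four 16-bit words of
// two registers, producing the rb layout r0,b0,r1,b1,r2,b2,r3,b3.
static void pack_rb_scalar(const short r[4], const short b[4], short rb[8])
{
    for(int i = 0; i < 4; i++)
    {
        rb[2 * i + 0] = r[i];
        rb[2 * i + 1] = b[i];
    }
}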
unsigned FLAC__fixed_compute_best_predictor_intrin_ssse3(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
{
    FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
    unsigned i, order;

    __m128i total_err0, total_err1, total_err2;

    {
        FLAC__int32 itmp;
        __m128i last_error;

        last_error = _mm_cvtsi32_si128(data[-1]);                           // 0   0   0   le0
        itmp = data[-2];
        last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
        last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));    // 0   0   le0 le1
        itmp -= data[-3];
        last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
        last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));    // 0   le0 le1 le2
        itmp -= data[-3] - data[-4];
        last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
        last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp));    // le0 le1 le2 le3

        total_err0 = total_err1 = _mm_setzero_si128();
        for(i = 0; i < data_len; i++) {
            __m128i err0, err1;
            err0 = _mm_cvtsi32_si128(data[i]);                              // 0   0   0   e0
            err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0));           // e0  e0  e0  e0
#if 1 /* OPT_SSE */
            err1 = _mm_sub_epi32(err1, last_error);
            last_error = _mm_srli_si128(last_error, 4);                     // 0   le0 le1 le2
            err1 = _mm_sub_epi32(err1, last_error);
            last_error = _mm_srli_si128(last_error, 4);                     // 0   0   le0 le1
            err1 = _mm_sub_epi32(err1, last_error);
            last_error = _mm_srli_si128(last_error, 4);                     // 0   0   0   le0
            err1 = _mm_sub_epi32(err1, last_error);                         // e1  e2  e3  e4
#else
            last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0  le1  le2+le0  le3+le1
            last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0  le1+le0  le2+le0+le1  le3+le1+le2+le0
            err1 = _mm_sub_epi32(err1, last_error);                         // e1  e2  e3  e4
#endif
            last_error = _mm_alignr_epi8(err0, err1, 4);                    // e0  e1  e2  e3

            err0 = _mm_abs_epi32(err0);
            err1 = _mm_abs_epi32(err1);

            total_err0 = _mm_add_epi32(total_err0, err0);                   // 0   0   0   te0
            total_err1 = _mm_add_epi32(total_err1, err1);                   // te1 te2 te3 te4
        }
    }

    total_error_0 = _mm_cvtsi128_si32(total_err0);
    total_err2 = total_err1;                                                // te1 te2 te3 te4
    total_err1 = _mm_srli_si128(total_err1, 8);                             // 0   0   te1 te2
    total_error_4 = _mm_cvtsi128_si32(total_err2);
    total_error_2 = _mm_cvtsi128_si32(total_err1);
    total_err2 = _mm_srli_si128(total_err2, 4);                             // 0   te1 te2 te3
    total_err1 = _mm_srli_si128(total_err1, 4);                             // 0   0   0   te1
    total_error_3 = _mm_cvtsi128_si32(total_err2);
    total_error_1 = _mm_cvtsi128_si32(total_err1);

    /* prefer higher order */
    if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
        order = 0;
    else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
        order = 1;
    else if(total_error_2 < flac_min(total_error_3, total_error_4))
        order = 2;
    else if(total_error_3 < total_error_4)
        order = 3;
    else
        order = 4;

    /* Estimate the expected number of bits per residual signal sample. */
    /* 'total_error*' is linearly related to the variance of the residual */
    /* signal, so we use it directly to compute E(|x|) */
    FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
    FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
    FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
    FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
    FLAC__ASSERT(data_len > 0 || total_error_4 == 0);

    residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
    residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
    residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
    residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
    residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);

    return order;
}
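/* A plain-C sketch of what the vectorized loop above accumulates (upstream
 * FLAC has a scalar twin of this routine; this illustration is not that code):
 * orders 0..4 are successive differences, and each total wraps modulo 2^32
 * exactly like the packed 32-bit lanes. Requires abs() from <stdlib.h>. */
static void fixed_errors_scalar(const FLAC__int32 *data, unsigned data_len, FLAC__uint32 total[5])
{
    FLAC__int32 last0 = data[-1];
    FLAC__int32 last1 = last0 - data[-2];
    FLAC__int32 last2 = last1 - (data[-2] - data[-3]);
    FLAC__int32 last3 = last2 - ((data[-2] - data[-3]) - (data[-3] - data[-4]));
    unsigned i;

    for(i = 0; i < data_len; i++) {
        FLAC__int32 e0 = data[i];
        FLAC__int32 e1 = e0 - last0;
        FLAC__int32 e2 = e1 - last1;
        FLAC__int32 e3 = e2 - last2;
        FLAC__int32 e4 = e3 - last3;
        total[0] += abs(e0); total[1] += abs(e1); total[2] += abs(e2);
        total[3] += abs(e3); total[4] += abs(e4);
        last0 = e0; last1 = e1; last2 = e2; last3 = e3;
    }
}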
// count genotype sum and number of calls, not requiring 16-aligned p
COREARRAY_DLL_DEFAULT C_UInt8* vec_u8_geno_count(C_UInt8 *p,
    size_t n, C_Int32 &out_sum, C_Int32 &out_num)
{
    C_Int32 sum=0, num=0;

#if defined(COREARRAY_SIMD_AVX2)

    const __m256i three = _mm256_set1_epi8(3);
    const __m256i zero = _mm256_setzero_si256();
    __m256i sum32 = zero, num32 = zero;
    size_t limit_by_U8 = 0;

    for (; n >= 32; )
    {
        __m256i v = _mm256_loadu_si256((__m256i const*)p);
        p += 32;
        __m256i m = _mm256_cmpgt_epi8(three, _mm256_min_epu8(v, three));
        sum32 = _mm256_add_epi8(sum32, _mm256_and_si256(v, m));
        num32 = _mm256_sub_epi8(num32, m);
        n -= 32;
        limit_by_U8 ++;
        if ((limit_by_U8 >= 127) || (n < 32))
        {
            // add to sum
            sum32 = _mm256_sad_epu8(sum32, zero);
            sum32 = _mm256_add_epi32(sum32,
                _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(1,0,3,2)));
            sum32 = _mm256_add_epi32(sum32,
                _mm256_permute4x64_epi64(sum32, _MM_SHUFFLE(0,0,0,1)));
            sum += _mm_cvtsi128_si32(_mm256_castsi256_si128(sum32));
            // add to num
            num32 = _mm256_sad_epu8(num32, zero);
            num32 = _mm256_add_epi32(num32,
                _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(1,0,3,2)));
            num32 = _mm256_add_epi32(num32,
                _mm256_permute4x64_epi64(num32, _MM_SHUFFLE(0,0,0,1)));
            num += _mm_cvtsi128_si32(_mm256_castsi256_si128(num32));
            // reset
            sum32 = num32 = zero;
            limit_by_U8 = 0;
        }
    }

#elif defined(COREARRAY_SIMD_SSE2)

    // header, 16-byte aligned
    size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
    for (; (n > 0) && (h > 0); n--, h--, p++)
        if (*p <= 2) { sum += *p; num++; }

    const __m128i three = _mm_set1_epi8(3);
    const __m128i zero = _mm_setzero_si128();
    __m128i sum16=zero, num16=zero;
    size_t limit_by_U8 = 0;

    for (; n >= 16; )
    {
        __m128i v = _mm_load_si128((__m128i const*)p);
        p += 16;
        __m128i m = _mm_cmpgt_epi8(three, _mm_min_epu8(v, three));
        sum16 = _mm_add_epi8(sum16, _mm_and_si128(v, m));
        num16 = _mm_sub_epi8(num16, m);
        n -= 16;
        limit_by_U8 ++;
        if ((limit_by_U8 >= 127) || (n < 16))
        {
            // add to sum
            sum16 = _mm_sad_epu8(sum16, zero);
            sum += _mm_cvtsi128_si32(sum16);
            sum += _mm_cvtsi128_si32(_mm_shuffle_epi32(sum16, 2));
            // add to num
            num16 = _mm_sad_epu8(num16, zero);
            num += _mm_cvtsi128_si32(num16);
            num += _mm_cvtsi128_si32(_mm_shuffle_epi32(num16, 2));
            // reset
            sum16 = num16 = zero;
            limit_by_U8 = 0;
        }
    }

#endif

    for (; n > 0; n--, p++)
        if (*p <= 2) { sum += *p; num++; }
    out_sum = sum;
    out_num = num;
    return p;
}
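// Why limit_by_U8 stops at 127: each unsigned 8-bit lane grows by at most 2
// per iteration (valid genotypes are 0..2), so 127 iterations keep every lane
// below 255 before it is flushed with _mm_sad_epu8. An illustrative helper
// (not part of the original file) showing the SSE2 flush used above:
static C_Int32 flush_u8_lanes(__m128i acc)
{
    // _mm_sad_epu8 against zero leaves two partial sums, one per 64-bit half
    __m128i s = _mm_sad_epu8(acc, _mm_setzero_si128());
    return _mm_cvtsi128_si32(s) +
        _mm_cvtsi128_si32(_mm_shuffle_epi32(s, 2));   // low half + high half
}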
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
        uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
{
    const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
    uint32_t partitions = 1u << max_partition_order;

    FLAC__ASSERT(default_partition_samples > predictor_order);

    /* first do max_partition_order */
    {
        const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
        uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);

        if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
            for(partition = residual_sample = 0; partition < partitions; partition++) {
                __m128i mm_sum = _mm_setzero_si128();
                uint32_t e1, e3;
                end += default_partition_samples;

                e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
                if(e1 > end)
                    e1 = end; /* try flac -l 1 -b 16 and you'll be here */

                /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
                for( ; residual_sample < e1; residual_sample++) {
                    __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
                    mm_sum = _mm_add_epi32(mm_sum, mm_res);
                }

                for( ; residual_sample < e3; residual_sample+=4) {
                    __m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
                    mm_sum = _mm_add_epi32(mm_sum, mm_res);
                }

                for( ; residual_sample < end; residual_sample++) {
                    __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
                    mm_sum = _mm_add_epi32(mm_sum, mm_res);
                }

                mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
                mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
                abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
                abs_residual_partition_sums[partition] &= 0xFFFFFFFF;
#endif
            }
        }
        else { /* have to pessimistically use 64 bits for accumulator */
            for(partition = residual_sample = 0; partition < partitions; partition++) {
                __m128i mm_sum = _mm_setzero_si128();
                uint32_t e1, e3;
                end += default_partition_samples;

                e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
                FLAC__ASSERT(e1 <= end);

                for( ; residual_sample < e1; residual_sample++) {
                    __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); /*  0   0   0  |r0|  ==  00  |r0_64| */
                    mm_sum = _mm_add_epi64(mm_sum, mm_res);
                }

                for( ; residual_sample < e3; residual_sample+=2) {
                    __m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /*  0   0  |r1| |r0|  */
                    mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /*  0  |r1|  0  |r0|  ==  |r1_64|  |r0_64|  */
                    mm_sum = _mm_add_epi64(mm_sum, mm_res);
                }

                for( ; residual_sample < end; residual_sample++) {
                    __m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
                    mm_sum = _mm_add_epi64(mm_sum, mm_res);
                }

                mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
                _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
            }
        }
    }

    /* now merge partitions for lower orders */
    {
        uint32_t from_partition = 0, to_partition = partitions;
        int partition_order;
        for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
            uint32_t i;
            partitions >>= 1;
            for(i = 0; i < partitions; i++) {
                abs_residual_partition_sums[to_partition++] =
                    abs_residual_partition_sums[from_partition  ] +
                    abs_residual_partition_sums[from_partition+1];
                from_partition += 2;
            }
        }
    }
}
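/* The merge step above in isolation: each lower partition order is just the
 * pairwise sum of the next higher one (an illustrative helper, not part of
 * the original file). */
static void merge_partition_sums(const FLAC__uint64 *from, FLAC__uint64 *to, uint32_t to_count)
{
    uint32_t i;
    for(i = 0; i < to_count; i++)
        to[i] = from[2 * i] + from[2 * i + 1];
}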
// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so
  //                  that we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and the 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3  = out[3];
    out[3]  = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
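// The branchless sign/abs trick used above, in scalar form (illustration
// only): with sign = in >> 15 being 0 or -1, (in ^ sign) - sign negates
// negative values; applying the same transform with the saved sign restores
// the original sign after quantization.
static int16_t AbsViaSign(int16_t in) {
  const int16_t sign = in >> 15;          // 0x0000 if positive, 0xffff if negative
  return (int16_t)((in ^ sign) - sign);   // == abs(in)
}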
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
                                           5352, 2217, 5352, 2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose.
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // a02 a12 a22 a32   a03 a13 a23 a33
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // 02 12 22 32  03 13 23 33
    // 00 10 20 30  01 11 21 31
    // 03 13 23 33  02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);

    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into
    // the desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}
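// Scalar form (illustrative, not part of the original file) of the first-pass
// row transform above, using the same constants; e[] holds one row of
// intermediate coefficients before the second pass.
static void FTransformRowPass1(const int16_t a[4], int32_t e[4]) {
  const int b0 = (a[0] + a[3]) << 3;   // (a0 + a3) << 3
  const int b1 = (a[1] + a[2]) << 3;   // (a1 + a2) << 3
  const int b3 = (a[0] - a[3]) << 3;   // (a0 - a3) << 3
  const int b2 = (a[1] - a[2]) << 3;   // (a1 - a2) << 3
  e[0] = b0 + b1;
  e[2] = b0 - b1;
  e[1] = (b3 * 5352 + b2 * 2217 + 14500) >> 12;
  e[3] = (b3 * 2217 - b2 * 5352 +  7500) >> 12;
}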