VECTOR ri; VECTOR ro; VECTOR iBoxMinX; VECTOR iBoxMinY; VECTOR iBoxMaxX; VECTOR iBoxMaxY; } anchor_info_t; static inline void __attribute__((__always_inline__,__gnu_inline__,__nonnull__,__artificial__)) multilaterate(anchor_info_t *anchors, size_t num_anchors, VECTOR L, VECTOR last_L, VECTOR minX, VECTOR maxX, VECTOR minY, VECTOR maxY, VECTOR *restrict resx, VECTOR *restrict resy) { // do iterative/recursive solution ivector_u maxScore; ivector_u maxScoreIndex = { _mm_set1_epi32(1) }; VECTOR finalX = zero; VECTOR finalY = zero; VECTOR iterMinX = minX; VECTOR iterMaxX = maxX; VECTOR iterMinY = minY; VECTOR iterMaxY = maxY; // Note: The vectorized implementation can result in more iterations for some // values. while (! VECTOR_TEST_ALL_ONES(VECTOR_LT(L, last_L))) { finalX = zero; finalY = zero; maxScore.v = _mm_set1_epi32(0); maxScoreIndex.v = _mm_set1_epi32(0);
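/* The convergence test above uses project-specific VECTOR macros whose
   definitions are not shown here. A minimal sketch of how they could be
   realized over SSE, assuming VECTOR is __m128 and "all ones" means every
   lane of the comparison mask is set; the project's real definitions may
   differ: */
#include <xmmintrin.h>
#define VECTOR_LT(a, b) _mm_cmplt_ps((a), (b))
/* _mm_movemask_ps gathers one sign bit per lane; 0xF means all four lanes
   of the comparison mask are true. */
#define VECTOR_TEST_ALL_ONES(m) (_mm_movemask_ps(m) == 0xF)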
void aom_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) and replicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the results together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the results together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes of the convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
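/* Side note on the rounding constant built above: a 32-bit splat of
   0x00400040 is identical to a 16-bit splat of 64, i.e. 0.5 in the Q7
   fixed-point scale consumed by the >> 7 at the end of the loop. A quick
   self-contained check (illustrative only, not part of the library): */
#include <assert.h>
#include <emmintrin.h>
static void check_round_const(void) {
  __m128i a = _mm_set1_epi32(0x00400040);
  __m128i b = _mm_set1_epi16(64);
  /* lane-wise equality across the whole register -> byte movemask is 0xFFFF */
  assert(_mm_movemask_epi8(_mm_cmpeq_epi16(a, b)) == 0xFFFF);
}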
int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30; __m128 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128 one_sixth = _mm_set1_ps(1.0/6.0); __m128 one_twelfth = _mm_set1_ps(1.0/12.0); __m128 dummy_mask,cutoff_mask; __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); __m128 one = _mm_set1_ps(1.0); __m128 two = _mm_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid; shiftvec = fr->shift_vec[0]; fshift = fr->fshift[0]; facel = _mm_set1_ps(fr->epsfac); charge = mdatoms->chargeA;
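/* The one_sixth / one_twelfth constants above are the usual GROMACS device
   for recovering the Lennard-Jones potential from force-style C6/C12
   coefficients, V = C12*r^-12/12 - C6*r^-6/6. A scalar sketch under that
   assumption (the kernel body that consumes them is not shown here): */
static float lj_potential_sketch(float rinvsq, float c6, float c12) {
  float rinvsix = rinvsq * rinvsq * rinvsq; /* r^-6 from r^-2 */
  float vvdw6 = c6 * rinvsix;               /* dispersion term */
  float vvdw12 = c12 * rinvsix * rinvsix;   /* repulsion term  */
  return vvdw12 * (1.0f / 12.0f) - vvdw6 * (1.0f / 6.0f);
}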
int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD; int *iinr,*jindex,*jjnr,*shiftidx,*gid; real rcutoff_scalar; real *shiftvec,*fshift,*x,*f; real *fjptrA,*fjptrB,*fjptrC,*fjptrD; real scratch[4*DIM]; __m128 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall; int vdwioffset0; __m128 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0; int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D; __m128 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128 velec,felec,velecsum,facel,crf,krf,krf2; real *charge; __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m128 dummy_mask,cutoff_mask; __m128 signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) ); __m128 one = _mm_set1_ps(1.0); __m128 two = _mm_set1_ps(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid;
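/* The vfitab/Y/F/G/H registers declared above drive GROMACS-style
   cubic-spline table interpolation. Per lane, with eps the fractional table
   coordinate, the scalar recurrence is as follows (a sketch inferred from
   the variable names; VV is the potential, FF the force): */
static void spline_eval_sketch(float Y, float F, float G, float H, float eps,
                               float *VV, float *FF) {
  float Heps = H * eps;
  float Fp = F + eps * (G + Heps);      /* derivative polynomial */
  *VV = Y + eps * Fp;                   /* interpolated potential */
  *FF = Fp + eps * (G + 2.0f * Heps);   /* interpolated force     */
}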
void aom_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x00400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) and replicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the shuffle masks for this filter size
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // pack each 16-bit value down to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
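/* For reference, a scalar model of one row iteration above: each of the 4
   output pixels is an 8-tap convolution over src[i-3 .. i+4], rounded by 64
   and shifted right by 7, then clamped to 8 bits. Sketch only; the SIMD
   version additionally reorders the saturating adds (min before max) so the
   16-bit partial sums cannot overflow. */
static void filter4_h8_scalar(const uint8_t *src, const int16_t *filter,
                              uint8_t *dst) {
  int i, k;
  for (i = 0; i < 4; ++i) {
    int sum = 64; /* Q7 rounding term, matches addFilterReg64 */
    for (k = 0; k < 8; ++k) sum += src[i + k - 3] * filter[k];
    sum >>= 7;
    dst[i] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}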
static inline double calc_output_single (SINC_FILTER *filter, const increment_t increment, const increment_t start_filter_index) { #ifdef RESAMPLER_SSE_OPT __m128i increment4; __m128 left128,right128; float left,right; #else double left,right; #endif const coeff_t * const __restrict coeffs = filter->coeffs; const float * const __restrict buffer = filter->buffer; increment_t filter_index, max_filter_index ; int data_index, coeff_count; /* Convert input parameters into fixed point. */ max_filter_index = int_to_fp (filter->coeff_half_len) ; /* First apply the left half of the filter. */ filter_index = start_filter_index ; coeff_count = (max_filter_index - filter_index) / increment ; filter_index = filter_index + coeff_count * increment ; data_index = filter->b_current - coeff_count ; #ifdef RESAMPLER_SSE_OPT increment4 = _mm_set_epi32(increment * 3, increment * 2, increment, 0); left128 = _mm_setzero_ps(); while(filter_index >= increment * 3) { #ifdef USE_WINDOWS_CODE __m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #else Windows__m128i indx; indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx.m128i,_mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #endif __m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below __m128 icoeff,icoeffp1,icoeffd,fraction; #ifdef _DEBUG icoeff0 = icoeff2 = _mm_setzero_ps(); #endif #ifdef USE_WINDOWS_CODE indx = _mm_srai_epi32(indx, SHIFT_BITS); #else indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS); #endif icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1])); icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3])); icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0)); icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1)); icoeffd = _mm_sub_ps(icoeffp1, icoeff); fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE)); icoeff = _mm_add_ps(icoeff,_mm_mul_ps(icoeffd, fraction)); left128 = _mm_add_ps(left128,_mm_mul_ps(icoeff, _mm_loadu_ps(buffer + data_index))); data_index += 4; filter_index -= increment * 4; } #endif left = 0.; while (filter_index >= MAKE_INCREMENT_T(0)) { coeff_t fraction = fp_to_float(filter_index); int indx = fp_to_int(filter_index); coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]); left += icoeff * buffer[data_index]; filter_index -= increment; data_index++; } /* Now apply the right half of the filter. 
*/ filter_index = increment - start_filter_index ; coeff_count = (max_filter_index - filter_index) / increment ; filter_index = filter_index + coeff_count * increment ; data_index = filter->b_current + 1 + coeff_count ; #ifdef RESAMPLER_SSE_OPT right128 = _mm_setzero_ps(); while (filter_index > increment * 3) { #ifdef USE_WINDOWS_CODE __m128i indx = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #else Windows__m128i indx; indx.m128i = _mm_sub_epi32(_mm_set1_epi32(filter_index), increment4); __m128i fractioni = _mm_and_si128(indx.m128i, _mm_set1_epi32(((((increment_t)1) << SHIFT_BITS) - 1))); #endif __m128 icoeff0, icoeff2; // warning that these are uninitialized is okay and its intended, as both high and low 64bit-parts are set below __m128 icoeff,icoeffp1,icoeffd,fraction,data; #ifdef _DEBUG icoeff0 = icoeff2 = _mm_setzero_ps(); #endif #ifdef USE_WINDOWS_CODE indx = _mm_srai_epi32(indx, SHIFT_BITS); #else indx.m128i = _mm_srai_epi32(indx.m128i, SHIFT_BITS); #endif icoeff0 = _mm_loadh_pi(_mm_loadl_pi(icoeff0, (__m64*)(coeffs + indx.m128i_i32[0])), (__m64*)(coeffs + indx.m128i_i32[1])); icoeff2 = _mm_loadh_pi(_mm_loadl_pi(icoeff2, (__m64*)(coeffs + indx.m128i_i32[2])), (__m64*)(coeffs + indx.m128i_i32[3])); icoeff = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(2, 0, 2, 0)); icoeffp1 = _mm_shuffle_ps(icoeff0, icoeff2, _MM_SHUFFLE(3, 1, 3, 1)); icoeffd = _mm_sub_ps(icoeffp1, icoeff); fraction = _mm_mul_ps(_mm_cvtepi32_ps(fractioni), _mm_set1_ps((float)INV_FP_ONE)); icoeff = _mm_add_ps(icoeff, _mm_mul_ps(icoeffd, fraction)); data = _mm_loadu_ps(buffer + (data_index - 3)); right128 = _mm_add_ps(right128,_mm_mul_ps(icoeff, _mm_shuffle_ps(data,data,_MM_SHUFFLE(0,1,2,3)))); data_index -= 4; filter_index -= increment * 4; } #endif right = 0.; while (filter_index > MAKE_INCREMENT_T(0)) { coeff_t fraction = fp_to_float(filter_index); int indx = fp_to_int(filter_index); coeff_t icoeff = coeffs[indx] + fraction * (coeffs[indx + 1] - coeffs[indx]); right += icoeff * buffer[data_index]; filter_index -= increment; data_index--; } return ( #ifdef RESAMPLER_SSE_OPT _mm_cvtss_f32(horizontal_add(left128)) + _mm_cvtss_f32(horizontal_add(right128)) + #endif left + right) ; } /* calc_output_single */
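/* horizontal_add() above collapses the four lane-wise partial sums into a
   register whose low lane holds the total. A minimal SSE3 sketch of a helper
   with that behavior (the project's actual definition lives elsewhere and
   may differ): */
#include <pmmintrin.h>
static inline __m128 horizontal_add(__m128 v) {
  v = _mm_hadd_ps(v, v); /* (a+b, c+d, a+b, c+d) */
  v = _mm_hadd_ps(v, v); /* lane 0 now holds a+b+c+d */
  return v;
}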
OD_SIMD_INLINE od_m256i od_mm256_set1_epi32(int c) { od_m256i r; r.lo = _mm_set1_epi32(c); r.hi = _mm_set1_epi32(c); return r; }
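/* The two-lane od_m256i wrapper above emulates one 256-bit register with a
   pair of SSE registers. Other "mm256" helpers follow the same split-lane
   pattern; a hypothetical lane-wise add in the same style (illustrative,
   not taken from the library): */
OD_SIMD_INLINE od_m256i od_mm256_add_epi32(od_m256i a, od_m256i b) {
  od_m256i r;
  r.lo = _mm_add_epi32(a.lo, b.lo);
  r.hi = _mm_add_epi32(a.hi, b.hi);
  return r;
}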
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator) { DBL x, y, z; DBL *mp; int ix, iy, iz; int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash; DBL sum; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator==kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } x = EPoint[X]; y = EPoint[Y]; z = EPoint[Z]; /* its equivalent integer lattice point. */ /* ix = (int)x; iy = (int)y; iz = (long)z; */ /* JB fix for the range problem */ __m128d xy = _mm_setr_pd(x, y); __m128d zn = _mm_set_sd(z); __m128d epsy = _mm_set1_pd(1.0 - EPSILON); __m128d xy_e = _mm_sub_pd(xy, epsy); __m128d zn_e = _mm_sub_sd(zn, epsy); __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy)); __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn)); __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0); __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ); __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy)); __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn)); const __m128i fff = _mm_set1_epi32(0xfff); __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff); __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff); ix = _mm_extract_epi32(i_xy, 0); iy = _mm_extract_epi32(i_xy, 1); iz = _mm_extract_epi32(i_zn, 0); ixiy_hash = Hash2d(ix, iy); jxiy_hash = Hash2d(ix + 1, iy); ixjy_hash = Hash2d(ix, iy + 1); jxjy_hash = Hash2d(ix + 1, iy + 1); mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)]; DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)]; DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)]; DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)]; DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)]; DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)]; DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)]; DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)]; const __m128d three = _mm_set1_pd(3.0); const __m128d two = _mm_set1_pd(2.0); const __m128d one = _mm_set1_pd(1.0); __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy); __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy); __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn); __m128d jx_mm = _mm_sub_pd(ix_mm, one); __m128d jy_mm = _mm_sub_pd(iy_mm, one); __m128d jz_mm = _mm_sub_pd(iz_mm, one); __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three)); __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three)); __m128d mm_tz = _mm_sub_pd(one, mm_sz); __m128d mm_txy = _mm_sub_pd(one, mm_sxy); __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy); __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy); __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy); __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm); __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm; __m128d int_sum1 = _mm_setzero_pd(); s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz); INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz); INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz); INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, 
mm_sz); INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1); int_sum1 = _mm_hadd_pd(int_sum1, int_sum1); if(noise_generator==kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. */ const __m128d r2 = _mm_set_sd(0.48985582); const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582); int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2); } else { int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5)); } int_sum1 = _mm_min_sd(one, int_sum1); int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1); _mm_store_sd(&sum, int_sum1); return (sum); }
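/* Scalar check of the range-corrected branch above: _mm_macc_sd computes
   sum*r2 + r1*r2 = (sum + 1.05242) * 0.48985582, which maps the measured
   output range [-1.05242, 0.988997] onto roughly [0, 1] before the final
   min/max clamp. Illustrative sketch: */
static double noise_range_correct(double sum) {
  const double r1 = 1.05242, r2 = 0.48985582;
  return (sum + r1) * r2; /* == sum * r2 + r1 * r2, as in _mm_macc_sd */
}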
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint) { DBL x, y, z; int ix, iy, iz; int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash; // TODO FIXME - global statistics reference // Stats[Calls_To_DNoise]++; x = EPoint[X]; y = EPoint[Y]; z = EPoint[Z]; /* its equivalent integer lattice point. */ /*ix = (int)x; iy = (int)y; iz = (int)z; x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/ /* JB fix for the range problem */ __m128d xy = _mm_setr_pd(x, y); __m128d zn = _mm_set_sd(z); __m128d epsy = _mm_set1_pd(1.0 - EPSILON); __m128d xy_e = _mm_sub_pd(xy, epsy); __m128d zn_e = _mm_sub_sd(zn, epsy); __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy)); __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn)); __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0); __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ); __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy)); __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn)); const __m128i fff = _mm_set1_epi32(0xfff); __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff); __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff); ix = _mm_extract_epi32(i_xy, 0); iy = _mm_extract_epi32(i_xy, 1); iz = _mm_extract_epi32(i_zn, 0); ixiy_hash = Hash2d(ix, iy); jxiy_hash = Hash2d(ix + 1, iy); ixjy_hash = Hash2d(ix, iy + 1); jxjy_hash = Hash2d(ix + 1, iy + 1); DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)]; DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)]; DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)]; DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)]; DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)]; DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)]; DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)]; DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)]; const __m128d three = _mm_set1_pd(3.0); const __m128d two = _mm_set1_pd(2.0); const __m128d one = _mm_set1_pd(1.0); __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy); __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy); __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn); __m128d jx_mm = _mm_sub_pd(ix_mm, one); __m128d jy_mm = _mm_sub_pd(iy_mm, one); __m128d jz_mm = _mm_sub_pd(iz_mm, one); __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three)); __m128d mm_tz = _mm_sub_pd(one, mm_sz); __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three)); __m128d mm_txy = _mm_sub_pd(one, mm_sxy); __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy); __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy); __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy); __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz); __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz); __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz); __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz); __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p; __m128d sum_X_Y = _mm_setzero_pd(); __m128d sum__Z = _mm_setzero_pd(); __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz); INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y); __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz); INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y); __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz); INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y); __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz); INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, 
sum_X_Y); __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz); INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y); __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz); INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y); __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz); INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y); __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz); INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y); __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm); INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z); INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z); INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z); INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z); sum__Z = _mm_hadd_pd(sum__Z, sum__Z); _mm_storeu_pd(*result, sum_X_Y); _mm_store_sd(&result[Z], sum__Z); }
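/* Both noise kernels above build their interpolation weights with
   _mm_nmacc_pd(two, t, three), i.e. 3 - 2t via an FMA4 negated multiply-add.
   The per-axis scalar weight is the classic Hermite s-curve; a sketch: */
static double scurve(double t) {
  return t * t * (3.0 - 2.0 * t); /* smooth step: 0 at t=0, 1 at t=1 */
}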
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *
 *            Valid for all IEEE754 floats $x_z$.
 *
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide;
 *            (k+127) must be >=0 and <=255, so (k+127)<<23
 *            is a valid IEEE754 float, without touching
 *            the sign bit. Pommier had this right in the
 *            first place, and I didn't understand.
 *
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *
 *            Note on maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
 *            k+127 is 0..255, a valid IEEE754 8-bit exponent,
 *            so the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k. If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf. If k<-127, k+127 is
 *            negative and we get screwed up. If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit. So
 *            for x' (base 2) < -127.5 we must definitely return e^x ~
 *            0; for x' < -126.5 we're going to calculate 0 anyway
 *            (because k=floor(-126.5-epsilon+0.5) = -127). So any
 *            minlogf between -126.5 log2 ... -127.5 log2 will suffice
 *            as the cutoff. Ditto for 126.5 log2 .. 127.5 log2.
 *            That's 87.68312 .. 88.3762655. I think Pommier's
 *            thinking is, you don't want to get too close to the
 *            edges, lest fp roundoff error screw you (he may have
 *            considered 1 ulp carefully, I can't tell), but otherwise
 *            you may as well put your bounds close to the outer edge;
 *            so
 *               maxlogf =  127.5 log(2) - epsilon
 *               minlogf = -127.5 log(2) + epsilon
 *            for an epsilon that happens to be ~ 3e-6.
 */
__m128
esl_sse_expf(__m128 x)
{
  static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f,
                               8.3334519073E-3f, 4.1665795894E-2f,
                               1.6666665459E-1f, 5.0000001201E-1f };
  static float cephes_c[2] = { 0.693359375f, -2.12194440e-4f };
  static float maxlogf = 88.3762626647949f;  /*  127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
  static float minlogf = -88.3762626647949f; /* -127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
  __m128i k;
  __m128  mask, tmp, fx, z, y, minmask, maxmask;

  /* handle out-of-range and special conditions */
  maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
  minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

  /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
  fx = _mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R));
  fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));

  /* floorf() with SSE: */
  k    = _mm_cvttps_epi32(fx);                /* cast to int with truncation                  */
  tmp  = _mm_cvtepi32_ps(k);                  /* cast back to float                           */
  mask = _mm_cmpgt_ps(tmp, fx);               /* if it increased (i.e. if it was negative...) */
  mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch...           */
  fx   = _mm_sub_ps(tmp, mask);               /* then subtract one.                           */
  k    = _mm_cvttps_epi32(fx);                /* k is now ready for the 2^k part.             */

  /* polynomial approx for e^f for f in range [-0.5, 0.5] */
  tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
  z   = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
  x   = _mm_sub_ps(x, tmp);
  x   = _mm_sub_ps(x, z);
  z   = _mm_mul_ps(x, x);

  y = _mm_set1_ps(cephes_p[0]);
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));
  y = _mm_mul_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, x);
  y = _mm_add_ps(y, _mm_set1_ps(1.0f));

  /* build 2^k by hand, by creating an IEEE754 float */
  k  = _mm_add_epi32(k, _mm_set1_epi32(127));
  k  = _mm_slli_epi32(k, 23);
  fx = _mm_castsi128_ps(k);

  /* put 2^k e^f together (fx = 2^k, y = e^f) and we're done */
  y = _mm_mul_ps(y, fx);

  /* special/range cleanup */
  y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128)  */
  y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        minmask); /* exp(x) = 0   for x < log(2^-149) */
  return y;
}
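/* A minimal usage sketch for esl_sse_expf(), comparing the four SIMD lanes
   against libm's expf(); everything here beyond the function above is
   illustrative: */
#include <math.h>
#include <stdio.h>
static void esl_sse_expf_demo(void) {
  float in[4] = { -10.0f, -1.0f, 0.0f, 10.0f };
  float out[4];
  _mm_storeu_ps(out, esl_sse_expf(_mm_loadu_ps(in)));
  for (int z = 0; z < 4; z++)
    printf("x = %g  simd = %g  libm = %g\n", in[z], out[z], expf(in[z]));
}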
void tuned_ConvertRGBToULY4(uint8_t *pYBegin, uint8_t *pUBegin, uint8_t *pVBegin, const uint8_t *pSrcBegin, const uint8_t *pSrcEnd, size_t cbWidth, ssize_t scbStride) { const int shift = 14; __m128i rb2y, xg2y, rb2u, xg2u, rb2v, xg2v; if (std::is_same<T, CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value) { rb2y = _mm_set2_epi16_shift(C::R2Y, C::B2Y, shift); xg2y = _mm_set2_epi16_shift(16.5 / 0xff, C::G2Y, shift); rb2u = _mm_set2_epi16_shift(C::R2U, C::B2U, shift); xg2u = _mm_set2_epi16_shift(128.5 / 0xff, C::G2U, shift); rb2v = _mm_set2_epi16_shift(C::R2V, C::B2V, shift); xg2v = _mm_set2_epi16_shift(128.5 / 0xff, C::G2V, shift); } else { rb2y = _mm_set2_epi16_shift(C::B2Y, C::R2Y, shift); xg2y = _mm_set2_epi16_shift(C::G2Y, 16.5 / 0xff, shift); rb2u = _mm_set2_epi16_shift(C::B2U, C::R2U, shift); xg2u = _mm_set2_epi16_shift(C::G2U, 128.5 / 0xff, shift); rb2v = _mm_set2_epi16_shift(C::B2V, C::R2V, shift); xg2v = _mm_set2_epi16_shift(C::G2V, 128.5 / 0xff, shift); } auto y = pYBegin; auto u = pUBegin; auto v = pVBegin; for (auto p = pSrcBegin; p != pSrcEnd; p += scbStride) { auto pp = p; for (; pp <= p + cbWidth - 16; pp += T::BYPP*4) { __m128i m = _mm_loadu_si128((const __m128i *)pp); __m128i rb, xg; if (std::is_same<T, CBGRAColorOrder>::value) { // m = XX R3 G3 B3 XX R2 G2 B2 XX R1 G1 B1 XX R0 G0 B0 rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0 xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 } #ifdef __SSSE3__ else if (std::is_same<T, CBGRColorOrder>::value) { // m = XX XX XX XX R3 G3 B3 R2 G2 B2 R1 G1 B1 R0 G0 B0 rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0 xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, -1, -1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1)), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 } #endif else if (std::is_same<T, CARGBColorOrder>::value) { // m = B3 G3 R3 XX B2 G2 R2 XX B1 G1 R1 XX B0 G0 R0 XX rb = _mm_srli_epi16(m, 8); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0 xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff } #ifdef __SSSE3__ else if (std::is_same<T, CRGBColorOrder>::value) { // m = XX XX XX XX B3 G3 R3 B2 G2 R2 B1 G1 R1 B0 G0 R0 rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0 xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1, -1, -1)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff } #endif auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint32_t { __m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv)); yuv = _mm_srli_epi32(yuv, shift); #ifdef __SSSE3__ if (F >= CODEFEATURE_SSSE3) { yuv = _mm_shuffle_epi8(yuv, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)); } else #endif { yuv = _mm_packs_epi32(yuv, yuv); yuv = _mm_packus_epi16(yuv, yuv); } return _mm_cvtsi128_si32(yuv); }; *(uint32_t *)y = xrgb2yuv(rb2y, xg2y); *(uint32_t *)u = xrgb2yuv(rb2u, xg2u); *(uint32_t *)v = xrgb2yuv(rb2v, xg2v); y += 4; u += 4; v += 4; } for (; pp < p + cbWidth; pp += T::BYPP) { __m128i m; __m128i rb, xg; if (std::is_same<T, 
CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value) { if (std::is_same<T, CBGRAColorOrder>::value) { m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 } else { m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 XX m = _mm_srli_epi32(m, 8); } rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 R0 00 B0 xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 } else if (std::is_same<T, CARGBColorOrder>::value || std::is_same<T, CRGBColorOrder>::value) { if (std::is_same<T, CARGBColorOrder>::value) { m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX } else { m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX } rb = _mm_srli_epi16(m, 8); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 B0 00 R0 xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 00 ff } auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint8_t { __m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv)); yuv = _mm_srli_epi32(yuv, shift); return (uint8_t)_mm_cvtsi128_si32(yuv); }; *y = xrgb2yuv(rb2y, xg2y); *u = xrgb2yuv(rb2u, xg2u); *v = xrgb2yuv(rb2v, xg2v); y++; u++; v++; } } }
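/* Scalar model of the per-pixel math above: each Y/U/V sample is a Q14
   fixed-point dot product evaluated by the two _mm_madd_epi16 calls and
   shifted right by `shift` (14). The bias coefficient rides in the lane
   that is forced to 0xff, so multiplying (bias/0xff) by 0xff reproduces the
   bias directly. Sketch with hypothetical floating-point coefficients: */
static uint8_t xrgb2yuv_scalar(int r, int g, int b, double cr, double cg,
                               double cb, double bias /* e.g. 16.5, 128.5 */) {
  const int q = 1 << 14; /* Q14 scale, matches `shift` above */
  int v = (int)(cr * q) * r + (int)(cg * q) * g + (int)(cb * q) * b
        + (int)(bias / 0xff * q) * 0xff;
  return (uint8_t)(v >> 14);
}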
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[256];
  int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
if (0 == pass) { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); // x = x << 2 in00 = _mm_slli_epi16(in00, 2); in01 = _mm_slli_epi16(in01, 2); in02 = _mm_slli_epi16(in02, 2); in03 = _mm_slli_epi16(in03, 2); in04 = _mm_slli_epi16(in04, 2); in05 = _mm_slli_epi16(in05, 2); in06 = _mm_slli_epi16(in06, 2); in07 = _mm_slli_epi16(in07, 2); in08 = _mm_slli_epi16(in08, 2); in09 = _mm_slli_epi16(in09, 2); in10 = _mm_slli_epi16(in10, 2); in11 = _mm_slli_epi16(in11, 2); in12 = _mm_slli_epi16(in12, 2); in13 = _mm_slli_epi16(in13, 2); in14 = _mm_slli_epi16(in14, 2); in15 = _mm_slli_epi16(in15, 2); } else { in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); // x = (x + 1) >> 2 in00 = _mm_add_epi16(in00, kOne); in01 = _mm_add_epi16(in01, kOne); in02 = _mm_add_epi16(in02, kOne); in03 = _mm_add_epi16(in03, kOne); in04 = _mm_add_epi16(in04, kOne); in05 = _mm_add_epi16(in05, kOne); in06 = _mm_add_epi16(in06, kOne); in07 = _mm_add_epi16(in07, kOne); in08 = _mm_add_epi16(in08, kOne); in09 = _mm_add_epi16(in09, kOne); in10 = _mm_add_epi16(in10, kOne); in11 = _mm_add_epi16(in11, kOne); in12 = _mm_add_epi16(in12, kOne); in13 = _mm_add_epi16(in13, kOne); in14 = _mm_add_epi16(in14, kOne); in15 = _mm_add_epi16(in15, kOne); in00 = _mm_srai_epi16(in00, 2); in01 = _mm_srai_epi16(in01, 2); in02 = _mm_srai_epi16(in02, 2); in03 = _mm_srai_epi16(in03, 2); in04 = _mm_srai_epi16(in04, 2); in05 = _mm_srai_epi16(in05, 2); in06 = _mm_srai_epi16(in06, 2); in07 = _mm_srai_epi16(in07, 2); in08 = _mm_srai_epi16(in08, 2); in09 = _mm_srai_epi16(in09, 2); in10 = _mm_srai_epi16(in10, 2); in11 = _mm_srai_epi16(in11, 2); in12 = _mm_srai_epi16(in12, 2); in13 = _mm_srai_epi16(in13, 2); in14 = _mm_srai_epi16(in14, 2); in15 = _mm_srai_epi16(in15, 2); } in += 8; // Calculate input for the first 8 results. 
{ input0 = _mm_add_epi16(in00, in15); input1 = _mm_add_epi16(in01, in14); input2 = _mm_add_epi16(in02, in13); input3 = _mm_add_epi16(in03, in12); input4 = _mm_add_epi16(in04, in11); input5 = _mm_add_epi16(in05, in10); input6 = _mm_add_epi16(in06, in09); input7 = _mm_add_epi16(in07, in08); } // Calculate input for the next 8 results. { step1_0 = _mm_sub_epi16(in07, in08); step1_1 = _mm_sub_epi16(in06, in09); step1_2 = _mm_sub_epi16(in05, in10); step1_3 = _mm_sub_epi16(in04, in11); step1_4 = _mm_sub_epi16(in03, in12); step1_5 = _mm_sub_epi16(in02, in13); step1_6 = _mm_sub_epi16(in01, in14); step1_7 = _mm_sub_epi16(in00, in15); } // Work on the first eight values; fdct8_1d(input, even_results); { // Add/substract const __m128i q0 = _mm_add_epi16(input0, input7); const __m128i q1 = _mm_add_epi16(input1, input6); const __m128i q2 = _mm_add_epi16(input2, input5); const __m128i q3 = _mm_add_epi16(input3, input4); const __m128i q4 = _mm_sub_epi16(input3, input4); const __m128i q5 = _mm_sub_epi16(input2, input5); const __m128i q6 = _mm_sub_epi16(input1, input6); const __m128i q7 = _mm_sub_epi16(input0, input7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res00 = _mm_packs_epi32(w0, w1); res08 = _mm_packs_epi32(w2, w3); res04 = _mm_packs_epi32(w4, w5); res12 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us // into 32 bits. 
const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = _mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us // into 32 bits. const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res02 = _mm_packs_epi32(w0, w1); res14 = _mm_packs_epi32(w2, w3); res10 = _mm_packs_epi32(w4, w5); res06 = _mm_packs_epi32(w6, w7); } } // Work on the next eight values; step1 -> odd_results { // step 2 { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = 
_mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_2 = _mm_packs_epi32(w0, w1); step2_3 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_5 = _mm_packs_epi32(w0, w1); step2_4 = _mm_packs_epi32(w2, w3); } // step 3 { step3_0 = _mm_add_epi16(step1_0, step2_3); step3_1 = _mm_add_epi16(step1_1, step2_2); step3_2 = _mm_sub_epi16(step1_1, step2_2); step3_3 = _mm_sub_epi16(step1_0, step2_3); step3_4 = _mm_sub_epi16(step1_7, step2_4); step3_5 = _mm_sub_epi16(step1_6, step2_5); step3_6 = _mm_add_epi16(step1_6, step2_5); step3_7 = _mm_add_epi16(step1_7, step2_4); } // step 4 { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_1 = _mm_packs_epi32(w0, w1); step2_2 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, 
k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine step2_6 = _mm_packs_epi32(w0, w1); step2_5 = _mm_packs_epi32(w2, w3); } // step 5 { step1_0 = _mm_add_epi16(step3_0, step2_1); step1_1 = _mm_sub_epi16(step3_0, step2_1); step1_2 = _mm_sub_epi16(step3_3, step2_2); step1_3 = _mm_add_epi16(step3_3, step2_2); step1_4 = _mm_add_epi16(step3_4, step2_5); step1_5 = _mm_sub_epi16(step3_4, step2_5); step1_6 = _mm_sub_epi16(step3_7, step2_6); step1_7 = _mm_add_epi16(step3_7, step2_6); } // step 6 { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res01 = _mm_packs_epi32(w0, w1); res09 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res05 = _mm_packs_epi32(w0, w1); res13 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = 
_mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res11 = _mm_packs_epi32(w0, w1); res03 = _mm_packs_epi32(w2, w3); } { const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); // Combine res15 = _mm_packs_epi32(w0, w1); res07 = _mm_packs_epi32(w2, w3); } } // Transpose the results, do it as two 8x8 transposes. { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 
27 37 47 57 67 77 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); } { // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 // Store results _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); } out += 8*16; } // Setup in/out for next pass. in = intermediate; out = output; } }
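/* Scalar model of the dct_const_round_shift step repeated throughout the
   kernel above: accumulate in 32 bits, add the rounding constant, then
   arithmetic-shift back down to 16-bit precision. With DCT_CONST_ROUNDING
   equal to 1 << (DCT_CONST_BITS - 1) this is round-to-nearest; sketch using
   the same macro names as above: */
static int16_t dct_const_round_shift_scalar(int32_t v) {
  return (int16_t)((v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}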
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose // the results. In the second one, we transform the rows. To achieve that, // as the first pass results are transposed, we transpose the columns (that // is the transposed rows) and transpose the results (so that it goes back // in normal/row positions). const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); const __m128i kOne = _mm_set1_epi16(1); __m128i in0, in1, in2, in3; // Load inputs. { in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); // x = x << 4 in0 = _mm_slli_epi16(in0, 4); in1 = _mm_slli_epi16(in1, 4); in2 = _mm_slli_epi16(in2, 4); in3 = _mm_slli_epi16(in3, 4); // if (i == 0 && input[0]) input[0] += 1; { // The mask will only contain whether the first value is zero; all // other comparisons will fail as something shifted by 4 (above << 4) // can never be equal to one. To increment in the non-zero case, we // add the mask and one for the first element: // - if zero, mask = -1, v = v - 1 + 1 = v // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); in0 = _mm_add_epi16(in0, mask); in0 = _mm_add_epi16(in0, k__nonzero_bias_b); } } // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { // Transform 1/2: Add/subtract const __m128i r0 = _mm_add_epi16(in0, in3); const __m128i r1 = _mm_add_epi16(in1, in2); const __m128i r2 = _mm_sub_epi16(in1, in2); const __m128i r3 = _mm_sub_epi16(in0, in3); // Transform 1/2: Interleave to do the multiply by constants which gets us // into 32 bits.
const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); // Combine and transpose const __m128i res0 = _mm_packs_epi32(w0, w2); const __m128i res1 = _mm_packs_epi32(w4, w6); // 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 if (0 == pass) { // Extract values in the high part for second pass as transform code // only uses the first four values. in1 = _mm_unpackhi_epi64(in0, in0); in3 = _mm_unpackhi_epi64(in2, in2); } else { // Post-condition output and store it (v + 1) >> 2, taking advantage // of the fact 1/3 are stored just after 0/2. __m128i out01 = _mm_add_epi16(in0, kOne); __m128i out23 = _mm_add_epi16(in2, kOne); out01 = _mm_srai_epi16(out01, 2); out23 = _mm_srai_epi16(out23, 2); _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); } } }
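// A scalar sketch of the branchless first-coefficient bias used in the
// function above: the cmpeq mask is -1 when the (already << 4) DC value is
// zero and 0 otherwise, so v + mask + 1 leaves zero alone and bumps any
// non-zero value by one, matching "if (i == 0 && input[0]) input[0] += 1".
#include <stdint.h>

static int16_t bias_dc_coeff(int16_t v) {    /* v == input[0] << 4 */
  int16_t mask = (v == 0) ? (int16_t)-1 : 0; /* one lane of _mm_cmpeq_epi16 */
  return (int16_t)(v + mask + 1);
}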
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { const int stride = pitch >> 1; int pass; // Constants // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); // Load input __m128i in0 = _mm_loadu_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_loadu_si128((const __m128i *)(input + 1 * stride)); __m128i in2 = _mm_loadu_si128((const __m128i *)(input + 2 * stride)); __m128i in3 = _mm_loadu_si128((const __m128i *)(input + 3 * stride)); __m128i in4 = _mm_loadu_si128((const __m128i *)(input + 4 * stride)); __m128i in5 = _mm_loadu_si128((const __m128i *)(input + 5 * stride)); __m128i in6 = _mm_loadu_si128((const __m128i *)(input + 6 * stride)); __m128i in7 = _mm_loadu_si128((const __m128i *)(input + 7 * stride)); // Pre-condition input (shift by two) in0 = _mm_slli_epi16(in0, 2); in1 = _mm_slli_epi16(in1, 2); in2 = _mm_slli_epi16(in2, 2); in3 = _mm_slli_epi16(in3, 2); in4 = _mm_slli_epi16(in4, 2); in5 = _mm_slli_epi16(in5, 2); in6 = _mm_slli_epi16(in6, 2); in7 = _mm_slli_epi16(in7, 2); // We do two passes, first the columns, then the rows. The results of the // first pass are transposed so that the same column code can be reused. The // results of the second pass are also transposed so that the rows (processed // as columns) are put back in row positions. for (pass = 0; pass < 2; pass++) { // To store results of each pass before the transpose. 
__m128i res0, res1, res2, res3, res4, res5, res6, res7; // Add/substract const __m128i q0 = _mm_add_epi16(in0, in7); const __m128i q1 = _mm_add_epi16(in1, in6); const __m128i q2 = _mm_add_epi16(in2, in5); const __m128i q3 = _mm_add_epi16(in3, in4); const __m128i q4 = _mm_sub_epi16(in3, in4); const __m128i q5 = _mm_sub_epi16(in2, in5); const __m128i q6 = _mm_sub_epi16(in1, in6); const __m128i q7 = _mm_sub_epi16(in0, in7); // Work on first four results { // Add/substract const __m128i r0 = _mm_add_epi16(q0, q3); const __m128i r1 = _mm_add_epi16(q1, q2); const __m128i r2 = _mm_sub_epi16(q1, q2); const __m128i r3 = _mm_sub_epi16(q0, q3); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(r0, r1); const __m128i t1 = _mm_unpackhi_epi16(r0, r1); const __m128i t2 = _mm_unpacklo_epi16(r2, r3); const __m128i t3 = _mm_unpackhi_epi16(r2, r3); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res0 = _mm_packs_epi32(w0, w1); res4 = _mm_packs_epi32(w2, w3); res2 = _mm_packs_epi32(w4, w5); res6 = _mm_packs_epi32(w6, w7); } // Work on next four results { // Interleave to do the multiply by constants which gets us into 32bits const __m128i d0 = _mm_unpacklo_epi16(q6, q5); const __m128i d1 = _mm_unpackhi_epi16(q6, q5); const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); // dct_const_round_shift const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); // Combine const __m128i r0 = _mm_packs_epi32(s0, s1); const __m128i r1 = _mm_packs_epi32(s2, s3); // Add/substract const __m128i x0 = _mm_add_epi16(q4, r0); const __m128i x1 = 
_mm_sub_epi16(q4, r0); const __m128i x2 = _mm_sub_epi16(q7, r1); const __m128i x3 = _mm_add_epi16(q7, r1); // Interleave to do the multiply by constants which gets us into 32bits const __m128i t0 = _mm_unpacklo_epi16(x0, x3); const __m128i t1 = _mm_unpackhi_epi16(x0, x3); const __m128i t2 = _mm_unpacklo_epi16(x1, x2); const __m128i t3 = _mm_unpackhi_epi16(x1, x2); const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); // dct_const_round_shift const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); // Combine res1 = _mm_packs_epi32(w0, w1); res7 = _mm_packs_epi32(w2, w3); res5 = _mm_packs_epi32(w4, w5); res3 = _mm_packs_epi32(w6, w7); } // Transpose the 8x8. 
{ // 00 01 02 03 04 05 06 07 // 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 // 30 31 32 33 34 35 36 37 // 40 41 42 43 44 45 46 47 // 50 51 52 53 54 55 56 57 // 60 61 62 63 64 65 66 67 // 70 71 72 73 74 75 76 77 const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 // 04 14 05 15 06 16 07 17 // 24 34 25 35 26 36 27 37 // 40 50 41 51 42 52 43 53 // 60 70 61 71 62 72 63 73 // 54 54 55 55 56 56 57 57 // 64 74 65 75 66 76 67 77 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); // 00 10 20 30 01 11 21 31 // 40 50 60 70 41 51 61 71 // 02 12 22 32 03 13 23 33 // 42 52 62 72 43 53 63 73 // 04 14 24 34 05 15 21 36 // 44 54 64 74 45 55 61 76 // 06 16 26 36 07 17 27 37 // 46 56 66 76 47 57 67 77 in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); // 00 10 20 30 40 50 60 70 // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 // 03 13 23 33 43 53 63 73 // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 // 07 17 27 37 47 57 67 77 } } // Post-condition output and store it { // Post-condition (division by two) // division of two 16 bits signed numbers using shifts // n / 2 = (n - (n >> 15)) >> 1 const __m128i sign_in0 = _mm_srai_epi16(in0, 15); const __m128i sign_in1 = _mm_srai_epi16(in1, 15); const __m128i sign_in2 = _mm_srai_epi16(in2, 15); const __m128i sign_in3 = _mm_srai_epi16(in3, 15); const __m128i sign_in4 = _mm_srai_epi16(in4, 15); const __m128i sign_in5 = _mm_srai_epi16(in5, 15); const __m128i sign_in6 = _mm_srai_epi16(in6, 15); const __m128i sign_in7 = _mm_srai_epi16(in7, 15); in0 = _mm_sub_epi16(in0, sign_in0); in1 = _mm_sub_epi16(in1, sign_in1); in2 = _mm_sub_epi16(in2, sign_in2); in3 = _mm_sub_epi16(in3, sign_in3); in4 = _mm_sub_epi16(in4, sign_in4); in5 = _mm_sub_epi16(in5, sign_in5); in6 = _mm_sub_epi16(in6, sign_in6); in7 = _mm_sub_epi16(in7, sign_in7); in0 = _mm_srai_epi16(in0, 1); in1 = _mm_srai_epi16(in1, 1); in2 = _mm_srai_epi16(in2, 1); in3 = _mm_srai_epi16(in3, 1); in4 = _mm_srai_epi16(in4, 1); in5 = _mm_srai_epi16(in5, 1); in6 = _mm_srai_epi16(in6, 1); in7 = _mm_srai_epi16(in7, 1); // store results _mm_storeu_si128((__m128i *)(output + 0 * 8), in0); _mm_storeu_si128((__m128i *)(output + 1 * 8), in1); _mm_storeu_si128((__m128i *)(output + 2 * 8), in2); _mm_storeu_si128((__m128i *)(output + 3 * 8), in3); _mm_storeu_si128((__m128i *)(output + 4 * 8), in4); _mm_storeu_si128((__m128i *)(output + 5 * 8), in5); _mm_storeu_si128((__m128i *)(output + 6 * 8), in6); 
_mm_storeu_si128((__m128i *)(output + 7 * 8), in7); } }
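// A scalar sketch of the sign-corrected halving in the post-condition block
// above: for an int16_t, n >> 15 is -1 when n is negative and 0 otherwise,
// so (n - (n >> 15)) >> 1 divides by two rounding toward zero, where a
// plain n >> 1 would round toward minus infinity.
#include <stdint.h>

static int16_t halve_toward_zero(int16_t n) {
  int16_t sign = (int16_t)(n >> 15);            /* 0 or -1 (arithmetic shift) */
  return (int16_t)((int16_t)(n - sign) >> 1);   /* e.g. -3 -> -1, 3 -> 1 */
}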
static void SinCos(const float rad, float &sin, float &cos) // #include <emmintrin.h>, #include <xmmintrin.h> { const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi); const __m128 _ps_0p5 = _mm_set1_ps(0.5f); const __m128 _ps_1 = _mm_set1_ps(1.0f); const __m128 _ps_dp1 = _mm_set1_ps(-0.7851562f); const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f); const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f); const __m128 _ps_sincof_p0 = _mm_set1_ps(2.443315711809948e-5f); const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f); const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f); const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f); const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f); const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f); const __m128i _pi32_1 = _mm_set1_epi32(1); const __m128i _pi32_i1 = _mm_set1_epi32(~1); const __m128i _pi32_2 = _mm_set1_epi32(2); const __m128i _pi32_4 = _mm_set1_epi32(4); const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000)); const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000)); __m128 mm1, mm2; __m128i mmi0, mmi2, mmi4; __m128 x, y, z; __m128 y1, y2; __m128 a = _mm_set1_ps(rad); x = _mm_and_ps(a, _mask_sign_inv); y = _mm_mul_ps(x, _ps_fopi); mmi2 = _mm_cvtps_epi32(y); mmi2 = _mm_add_epi32(mmi2, _pi32_1); mmi2 = _mm_and_si128(mmi2, _pi32_i1); y = _mm_cvtepi32_ps(mmi2); mmi4 = mmi2; mmi0 = _mm_and_si128(mmi2, _pi32_4); mmi0 = _mm_slli_epi32(mmi0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0); mmi2 = _mm_and_si128(mmi2, _pi32_2); mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(mmi2); x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1)); x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2)); x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3)); mmi4 = _mm_sub_epi32(mmi4, _pi32_2); mmi4 = _mm_andnot_si128(mmi4, _pi32_4); mmi4 = _mm_slli_epi32(mmi4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(mmi4); __m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin); z = _mm_mul_ps(x, x); y1 = _mm_mul_ps(_ps_coscof_p0, z); y1 = _mm_add_ps(y1, _ps_coscof_p1); y1 = _mm_mul_ps(y1, z); y1 = _mm_add_ps(y1, _ps_coscof_p2); y1 = _mm_mul_ps(y1, z); y1 = _mm_mul_ps(y1, z); y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5)); y1 = _mm_add_ps(y1, _ps_1); y2 = _mm_mul_ps(_ps_sincof_p0, z); y2 = _mm_add_ps(y2, _ps_sincof_p1); y2 = _mm_mul_ps(y2, z); y2 = _mm_add_ps(y2, _ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); __m128 sin1y = _mm_andnot_ps(poly_mask, y1); __m128 sin2y = _mm_and_ps(poly_mask, y2); mm1 = _mm_add_ps(sin1y, sin2y); mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y)); sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin)); cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos)); }
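// A hypothetical C++ smoke test for SinCos() above (not part of the
// original source): it assumes only that the Cephes-style polynomial
// tracks sinf/cosf to a few ulp over moderate arguments.
#include <math.h>
#include <stdio.h>

static void sincos_smoke_test(void) {
  for (float rad = -6.0f; rad <= 6.0f; rad += 0.25f) {
    float s, c;
    SinCos(rad, s, c);  /* defined above */
    if (fabsf(s - sinf(rad)) > 1e-5f || fabsf(c - cosf(rad)) > 1e-5f)
      printf("mismatch at rad = %f\n", rad);
  }
}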
bool scanhash_sse2_32(struct thr_info*thr, const unsigned char *pmidstate, unsigned char *pdata, unsigned char *phash1, unsigned char *phash, const unsigned char *ptarget, uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce) { uint32_t *hash32 = (uint32_t *)phash; uint32_t *nNonce_p = (uint32_t *)(pdata + 76); uint32_t m_midstate[8], m_w[16], m_w1[16]; __m128i m_4w[64] __attribute__ ((aligned (0x100))); __m128i m_4hash[64] __attribute__ ((aligned (0x100))); __m128i m_4hash1[64] __attribute__ ((aligned (0x100))); __m128i offset; int i; pdata += 64; /* Message expansion */ memcpy(m_midstate, pmidstate, sizeof(m_midstate)); memcpy(m_w, pdata, sizeof(m_w)); /* The 2nd half of the data */ memcpy(m_w1, phash1, sizeof(m_w1)); memset(m_4hash, 0, sizeof(m_4hash)); /* Transmongrify */ for (i = 0; i < 16; i++) m_4w[i] = _mm_set1_epi32(m_w[i]); for (i = 0; i < 16; i++) m_4hash1[i] = _mm_set1_epi32(m_w1[i]); for (i = 0; i < 64; i++) sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]); offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); for (;;) { int j; m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce)); /* Some optimization can be done here W.R.T. precalculating some hash */ CalcSha256_x86 (m_4hash1, m_4w, m_midstate); CalcSha256_x86 (m_4hash, m_4hash1, sha256_32init); for (j = 0; j < 4; j++) { if (unlikely(((uint32_t *)&(m_4hash[7]))[j] == 0)) { /* We found a hit...so check it */ /* Use the C version for a check... */ for (i = 0; i < 8; i++) { *(uint32_t *)&(phash)[i<<2] = ((uint32_t *)&(m_4hash[i]))[j]; } if (unlikely(hash32[7] == 0 && fulltest(phash, ptarget))) { nonce += j; *last_nonce = nonce; *nNonce_p = nonce; return true; } } } if (unlikely((nonce >= max_nonce) || thr->work_restart)) { *last_nonce = nonce; return false; } nonce += 4; } }
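// A minimal sketch of the nonce vectorization used above: a fixed lane
// offset {0,1,2,3} is added to a broadcast base nonce so each SHA-256
// evaluation tests four candidate nonces at once. This helper only mirrors
// the m_4w[3] setup; the hashing itself is unchanged.
#include <emmintrin.h>
#include <stdint.h>

static __m128i nonce_lanes(uint32_t base_nonce) {
  const __m128i offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0); /* lane i gets +i */
  return _mm_add_epi32(offset, _mm_set1_epi32((int)base_nonce));
}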
/// CURRENTLY SAME CODE AS SCALAR !! /// REPLACE HERE WITH SSE intrinsics static void partialButterflyInverse16_simd(short *src, short *dst, int shift) { int add = 1<<(shift-1); //we cast the original 16X16 matrix to a SIMD vector type __m128i *g_aiT16_vec = (__m128i *)g_aiT16; //We cast the input source (which is basically random numbers (see the main function for details)) to a SIMD vector type //We also cast the output to a SIMD vector type __m128i *in_vec = (__m128i *) src; __m128i *out_vec = (__m128i *) dst; //we declare an 8X8 array and cast it to a SIMD vector type short gt[8][8] __attribute__ ((aligned (16))); __m128i *gt_vec = (__m128i *)gt; //we declare a 16X16 array and cast it to a SIMD vector type short random[16][16] __attribute__ ((aligned (16))); __m128i *random_vec = (__m128i *)random; trans_g_aiT16(g_aiT16_vec,gt_vec); tranpose8x8(in_vec,2, random_vec,0); tranpose8x8(in_vec,3, random_vec,8); tranpose8x8(in_vec,0, random_vec,16); tranpose8x8(in_vec,1, random_vec,24); for (int j=0; j<16; j++) { /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ __m128i I0 = _mm_load_si128 (&random_vec[j]); __m128i II0 = _mm_load_si128 (&random_vec[j+16]); // for (int k=0; k<8; k++) //here we are loading up the transposed values in the initial matrix //multiplying it with the input numbers to produce intermediate 32-bit integers // we then sum up adjacent pairs of 32-bit integers and store them in the destination register __m128i I1 = _mm_load_si128 (&gt_vec[0]); __m128i I2 = _mm_madd_epi16 (I1, I0); __m128i I3 = _mm_load_si128 (&gt_vec[1]); __m128i I4 = _mm_madd_epi16 (I3, I0); __m128i I5 = _mm_load_si128 (&gt_vec[2]); __m128i I6 = _mm_madd_epi16 (I5, I0); __m128i I7 = _mm_load_si128 (&gt_vec[3]); __m128i I8 = _mm_madd_epi16 (I7, I0); __m128i I9 = _mm_load_si128 (&gt_vec[4]); __m128i I10 = _mm_madd_epi16 (I9, I0); __m128i I11 = _mm_load_si128 (&gt_vec[5]); __m128i I12 = _mm_madd_epi16 (I11, I0); __m128i I13 = _mm_load_si128 (&gt_vec[6]); __m128i I14 = _mm_madd_epi16 (I13, I0); __m128i I15 = _mm_load_si128 (&gt_vec[7]); __m128i I16 = _mm_madd_epi16 (I15, I0); //horizontally add the partial results obtained from the previous step __m128i A1 =_mm_hadd_epi32 (I2, I4); __m128i A2 =_mm_hadd_epi32 (I6, I8); __m128i R1 =_mm_hadd_epi32 (A1, A2); __m128i A3 =_mm_hadd_epi32 (I10, I12); __m128i A4 =_mm_hadd_epi32 (I14, I16); __m128i R2 =_mm_hadd_epi32 (A3, A4); // O[k] = T[0]+T[1]+T[2]+T[3]; // for (int k=0; k<4; k++) // { //load the original matrix values, multiply it with the random values //store the low bits to I2 and the hi bits to I3 I1 = _mm_load_si128 (&gt_vec[8]); I2 = _mm_mullo_epi16 (I1, II0); I3 = _mm_mulhi_epi16 (I1, II0); __m128i lowI23 = _mm_unpacklo_epi16(I2,I3); __m128i hiI23 = _mm_unpackhi_epi16(I2,I3); __m128i temp1 = _mm_add_epi32(lowI23,hiI23); __m128i temp5 = _mm_hsub_epi32 (lowI23, hiI23); I4 = _mm_load_si128 (&gt_vec[9]); I5 = _mm_mullo_epi16 (I4, II0); I6 = _mm_mulhi_epi16 (I4, II0); __m128i lowI56 = _mm_unpacklo_epi16(I5,I6); __m128i hiI56 = _mm_unpackhi_epi16(I5,I6); __m128i temp2 = _mm_add_epi32(lowI56,hiI56); __m128i temp6 = _mm_hsub_epi32 (lowI56, hiI56); I7 = _mm_load_si128 (&gt_vec[10]); I8 = _mm_mullo_epi16 (I7, II0); I9 = _mm_mulhi_epi16 (I7, II0); __m128i lowI89 = _mm_unpacklo_epi16(I8,I9); __m128i hiI89 = _mm_unpackhi_epi16(I8,I9); __m128i temp3 = _mm_add_epi32(lowI89,hiI89); __m128i temp7 = _mm_hsub_epi32 (lowI89, hiI89); I10 = _mm_load_si128 (&gt_vec[11]); I11 = _mm_mullo_epi16 (I10, II0); I12 = _mm_mulhi_epi16 (I10, II0); __m128i lowI1112 = 
_mm_unpacklo_epi16(I11,I12); __m128i hiI1112 = _mm_unpackhi_epi16(I11,I12); __m128i temp4 = _mm_add_epi32(lowI1112,hiI1112); __m128i temp8 = _mm_hsub_epi32 (lowI1112, hiI1112); __m128i A5 =_mm_hadd_epi32 (temp1, temp2); __m128i A6 =_mm_hadd_epi32 (temp3, temp4); __m128i R3 =_mm_hadd_epi32 (A5, A6); __m128i A7 =_mm_hadd_epi32 (temp8, temp7); __m128i A8 =_mm_hadd_epi32 (temp6, temp5); __m128i R4 =_mm_hadd_epi32 (A7, A8); /////////////////////////// __m128i add_reg = _mm_set1_epi32(add); __m128i sum_vec0 = _mm_add_epi32(R3,R1); sum_vec0 = _mm_add_epi32(sum_vec0,add_reg); sum_vec0 = _mm_srai_epi32(sum_vec0, shift); // shift right __m128i sum_vec1 = _mm_add_epi32(R4,R2); sum_vec1 = _mm_add_epi32(sum_vec1,add_reg); sum_vec1 = _mm_srai_epi32(sum_vec1, shift); // shift right __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1); // shrink packed 32bit to packed 16 bit and saturate _mm_store_si128 (&out_vec[2*j], finalres0); __m128i sum_vec2 = _mm_sub_epi32(R4, R2); sum_vec2 = _mm_add_epi32(sum_vec2,add_reg); sum_vec2 = _mm_srai_epi32(sum_vec2, shift); // shift right __m128i sum_vec3 = _mm_sub_epi32(R3, R1); sum_vec3 = _mm_add_epi32(sum_vec3,add_reg); sum_vec3 = _mm_srai_epi32(sum_vec3, shift); // shift right I5 = _mm_unpackhi_epi32(sum_vec2, sum_vec3); I6 = _mm_unpacklo_epi32(sum_vec2, sum_vec3); I7 = _mm_unpackhi_epi32(I5, I6); I8 = _mm_unpacklo_epi32(I5, I6); I9 = _mm_unpacklo_epi32(I7, I8); I10 = _mm_unpackhi_epi32(I7, I8); sum_vec3 = _mm_packs_epi32(I9, I10); // shrink packed 32bit to packed 16 bit and saturate _mm_store_si128 (&out_vec[2*j+1], sum_vec3); } }
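// A sketch of the madd/hadd dot-product pattern the function above leans
// on: one _mm_madd_epi16 yields four 32-bit pairwise sums, and two rounds
// of _mm_hadd_epi32 (SSSE3) fold four such vectors into one vector holding
// four complete 8-term dot products. Names here are illustrative only.
#include <tmmintrin.h>

static __m128i four_dot_products(__m128i row, __m128i c0, __m128i c1,
                                 __m128i c2, __m128i c3) {
  __m128i d0 = _mm_madd_epi16(c0, row);  /* pairwise 16x16 -> 32 sums */
  __m128i d1 = _mm_madd_epi16(c1, row);
  __m128i d2 = _mm_madd_epi16(c2, row);
  __m128i d3 = _mm_madd_epi16(c3, row);
  __m128i s01 = _mm_hadd_epi32(d0, d1);  /* partial sums of d0 and d1 */
  __m128i s23 = _mm_hadd_epi32(d2, d3);  /* partial sums of d2 and d3 */
  return _mm_hadd_epi32(s01, s23);       /* {dot0, dot1, dot2, dot3} */
}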
int32_t * const restrict del_pr = _del_pr+PAD; #ifdef PARASAIL_TABLE parasail_result_t *result = parasail_result_new_table1(s1Len, s2Len); #else #ifdef PARASAIL_ROWCOL parasail_result_t *result = parasail_result_new_rowcol1(s1Len, s2Len); #else parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int32_t score = NEG_INF; __m128i vNegInf = _mm_set1_epi32(NEG_INF); __m128i vNegInf0 = _mm_srli_si128(vNegInf, 4); /* shift in a 0 */ __m128i vOpen = _mm_set1_epi32(open); __m128i vGap = _mm_set1_epi32(gap); __m128i vZero = _mm_set1_epi32(0); __m128i vOne = _mm_set1_epi32(1); __m128i vN = _mm_set1_epi32(N); __m128i vNegOne = _mm_set1_epi32(-1); __m128i vI = _mm_set_epi32(0,1,2,3); __m128i vJreset = _mm_set_epi32(0,-1,-2,-3); __m128i vMax = vNegInf; __m128i vEndI = vNegInf; __m128i vEndJ = vNegInf; __m128i vILimit = _mm_set1_epi32(s1Len); __m128i vJLimit = _mm_set1_epi32(s2Len);
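// A sketch of the vNegInf0 construction above: _mm_srli_si128 shifts the
// whole register right by bytes, so four zero bytes enter at the top and
// the highest lane becomes 0 while the others stay NEG_INF. The NEG_INF
// value below is an illustrative stand-in, not parasail's definition.
#include <emmintrin.h>

static __m128i neg_inf_with_zero_lane(void) {
  const int kNegInf = -(1 << 30);      /* assumed sentinel */
  __m128i vNegInf = _mm_set1_epi32(kNegInf);
  return _mm_srli_si128(vNegInf, 4);   /* {NEG_INF, NEG_INF, NEG_INF, 0} */
}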
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t *p0 = (uint16_t *)buff + 8; uint16_t *p1 = p0 + bstride; uint16_t *p2 = p1 + bstride; uint16_t *p3 = p2 + bstride; uint16_t *p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 rdiv = _mm_set1_ps((float)ch->rdiv); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i max = _mm_set1_epi32(0xFFFF); __m128 matrix[25]; for (int i = 0; i < 25; i++) { matrix[i] = _mm_set1_ps((float)ch->m[i]); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); uint16_t *array[] = { p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 }; for (int x = 0; x < width; x += 8) { __m128 sum[2] = {(__m128)zero, (__m128)zero}; for (int i = 0; i < 25; i++) { __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x)); __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero)); __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero)); xmm1 = _mm_mul_ps(xmm1, matrix[i]); xmm2 = _mm_mul_ps(xmm2, matrix[i]); sum[0] = _mm_add_ps(sum[0], xmm1); sum[1] = _mm_add_ps(sum[1], xmm2); } __m128i sumi[2]; for (int i = 0; i < 2; i++) { sum[i] = _mm_mul_ps(sum[i], rdiv); sum[i] = _mm_add_ps(sum[i], bias); if (!ch->saturate) { sum[i] = mm_abs_ps(sum[i]); } sumi[i] = _mm_cvtps_epi32(sum[i]); sumi[i] = mm_min_epi32(sumi[i], max); __m128i mask = _mm_cmpgt_epi32(sumi[i], zero); sumi[i] = _mm_and_si128(sumi[i], mask); } sumi[0] = mm_cast_epi32(sumi[0], sumi[1]); _mm_store_si128((__m128i *)(dstp + x), sumi[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
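// The helpers mm_min_epi32() and mm_cast_epi32() above are local wrappers
// (SSE2 has no signed 32-bit min; pminsd only arrives with SSE4.1). A
// common SSE2-only emulation, given here as a sketch of what such a
// wrapper can hide, is a compare followed by a branchless blend:
#include <emmintrin.h>

static __m128i min_epi32_sse2(__m128i a, __m128i b) {
  __m128i gt = _mm_cmpgt_epi32(a, b);           /* -1 in lanes where a > b */
  return _mm_or_si128(_mm_and_si128(gt, b),     /* take b where a > b */
                      _mm_andnot_si128(gt, a)); /* take a elsewhere */
}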
/* Function: esl_sse_logf() * Synopsis: <r[z] = log x[z]> * Incept: SRE, Fri Dec 14 11:32:54 2007 [Janelia] * * Purpose: Given a vector <x> containing four floats, returns a * vector <r> in which each element <r[z] = logf(x[z])>. * * Valid in the domain $x_z > 0$ for normalized IEEE754 * $x_z$. * * For <x> $< 0$, including -0, returns <NaN>. For <x> $== * 0$ or subnormal <x>, returns <-inf>. For <x = inf>, * returns <inf>. For <x = NaN>, returns <NaN>. For * subnormal <x>, returns <-inf>. * * Xref: J2/71. * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2 and added handling * of IEEE754 specials. */ __m128 esl_sse_logf(__m128 x) { static float cephes_p[9] = { 7.0376836292E-2f, -1.1514610310E-1f, 1.1676998740E-1f, -1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f, 2.0000714765E-1f, -2.4999993993E-1f, 3.3333331174E-1f }; __m128 onev = _mm_set1_ps(1.0f); /* all elem = 1.0 */ __m128 v0p5 = _mm_set1_ps(0.5f); /* all elem = 0.5 */ __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */ __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */ __m128i ei; __m128 e; __m128 invalid_mask, zero_mask, inf_mask; /* masks used to handle special IEEE754 inputs */ __m128 mask; __m128 origx; __m128 tmp; __m128 y; __m128 z; /* first, split x apart: x = frexpf(x, &e); */ ei = _mm_srli_epi32( _mm_castps_si128(x), 23); /* shift right 23: IEEE754 floats: ei = biased exponents */ invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg)); /* mask any elem that's negative; these become NaN */ zero_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128())); /* mask any elem zero or subnormal; these become -inf */ inf_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp)); /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN */ origx = x; /* store original x, used for log(inf) = inf, log(NaN) = NaN */ x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand */ x = _mm_or_ps (x, v0p5); /* sets hidden bit b[0] */ ei = _mm_sub_epi32(ei, _mm_set1_epi32(126)); /* -127 (ei now signed base-2 exponent); then +1 */ e = _mm_cvtepi32_ps(ei); /* now, calculate the log */ mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches. 
*/ tmp = _mm_and_ps(x, mask); /* tmp contains x values < 0.707, else 0 */ x = _mm_sub_ps(x, onev); e = _mm_sub_ps(e, _mm_and_ps(onev, mask)); x = _mm_add_ps(x, tmp); z = _mm_mul_ps(x,x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8])); y = _mm_mul_ps(y, x); y = _mm_mul_ps(y, z); tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f)); y = _mm_add_ps(y, tmp); tmp = _mm_mul_ps(z, v0p5); y = _mm_sub_ps(y, tmp); tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f)); x = _mm_add_ps(x, y); x = _mm_add_ps(x, tmp); /* IEEE754 cleanup: */ x = esl_sse_select_ps(x, origx, inf_mask); /* log(inf)=inf; log(NaN) = NaN */ x = _mm_or_ps(x, invalid_mask); /* log(x<0, including -0,-inf) = NaN */ x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal = -inf */ return x; }
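// esl_sse_logf() above drives its IEEE754 cleanup through
// esl_sse_select_ps(); a minimal sketch of that style of mask-driven
// select (assuming each mask lane is all-ones or all-zeros, as the
// comparisons above produce) is the usual andnot/and/or blend:
#include <xmmintrin.h>

static __m128 select_ps(__m128 a, __m128 b, __m128 mask) {
  /* per lane: returns b where mask is set, a where it is clear */
  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
}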
OD_SIMD_INLINE __m128i od_dct_mul_epi32(__m128i val, int32_t scale, int32_t offset, int32_t shift) { return _mm_srai_epi32(_mm_add_epi32(OD_MULLO_EPI32(val, scale), _mm_set1_epi32(offset)), shift); }
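// OD_MULLO_EPI32 above abstracts a 32-bit low multiply. On SSE4.1 it can
// map directly to _mm_mullo_epi32; a plain-SSE2 fallback (a sketch of one
// common way to implement such a macro, not necessarily Daala's) builds
// it from two widening _mm_mul_epu32 multiplies:
#include <emmintrin.h>

static __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
  __m128i even = _mm_mul_epu32(a, b);  /* 64-bit products of lanes 0 and 2 */
  __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4),
                              _mm_srli_si128(b, 4));  /* lanes 1 and 3 */
  /* keep the low 32 bits of each product and re-interleave them */
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                            _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0)));
}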
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
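// A scalar sketch of the "f1 = f1 + (a3 != 0)" trick explained in the
// comments above: the rounder is biased by (1 << 16), so after the >> 16
// the result is exactly one too large, and adding the 0/-1 lane that
// _mm_cmpeq_epi16 produces corrects it precisely when a3 == 0.
#include <stdint.h>

static int16_t f1_with_nonzero_bias(int32_t c1, int16_t a3) {
  int32_t biased = (c1 + 12000 + (1 << 16)) >> 16;  /* one too large */
  int16_t eq = (a3 == 0) ? (int16_t)-1 : 0;         /* cmpeq lane value */
  return (int16_t)(biased + eq);                    /* == f1 + (a3 != 0) */
}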
static void aom_filter_block1d4_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32; __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, srcReg6, srcReg56; __m128i srcReg23_34_lo, srcReg45_56_lo; __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; __m128i resReglo, resReghi; __m128i firstFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); // multiply the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); // keep consecutive loads interleaved in the same 128 bit register srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); // merge every two consecutive registers srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); // multiply 2 adjacent elements with the filter and add the result resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); // shift by 6 bits each 16 bit resReglo = _mm_adds_epi16(resReglo, addFilterReg32); resReghi = _mm_adds_epi16(resReghi, addFilterReg32); resReglo = _mm_srai_epi16(resReglo, 6); resReghi = _mm_srai_epi16(resReghi, 6); // shrink to 8 bit each 16 bits, the first lane contains the first // convolve result and the second lane contains the second convolve // result resReglo = _mm_packus_epi16(resReglo, resReglo); resReghi = _mm_packus_epi16(resReghi, resReghi); src_ptr += src_stride; *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg4 = srcReg6; } }
pstatus_t sse2_set_32u( UINT32 val, UINT32 *pDst, INT32 len) { UINT32 *dptr = (UINT32 *) pDst; __m128i xmm0; size_t count; /* If really short, just do it here. */ if (len < 32) { while (len--) *dptr++ = val; return PRIMITIVES_SUCCESS; } /* Assure we can reach 16-byte alignment. */ if (((ULONG_PTR) dptr & 0x03) != 0) { return general_set_32u(val, pDst, len); } /* Seek 16-byte alignment. */ while ((ULONG_PTR) dptr & 0x0f) { *dptr++ = val; if (--len == 0) return PRIMITIVES_SUCCESS; } xmm0 = _mm_set1_epi32(val); /* Cover 256-byte chunks via SSE register stores. */ count = len >> 6; len -= count << 6; /* Do 256-byte chunks using one XMM register. */ while (count--) { _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; } /* Cover 16-byte chunks via SSE register stores. */ count = len >> 2; len -= count << 2; /* Do 16-byte chunks using one XMM register. */ while (count--) { _mm_store_si128((__m128i *) dptr, xmm0); dptr += 4; } /* Do leftover bytes. */ while (len--) *dptr++ = val; return PRIMITIVES_SUCCESS; }
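// A hypothetical caller for sse2_set_32u() above (not from the original
// source): it assumes only a 4-byte-aligned destination, since the
// function falls back to general_set_32u() otherwise and walks up to
// 16-byte alignment itself before the wide stores.
#include <stdlib.h>

static void fill_example(void) {
  enum { N = 1000 };
  UINT32 *buf = (UINT32 *)aligned_alloc(16, N * sizeof(UINT32));
  if (buf == NULL) return;
  sse2_set_32u(0xDEADBEEFu, buf, N);  /* defined above */
  free(buf);
}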
void aom_filter_block1d8_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 128 bit register forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); filt1Reg = _mm_load_si128((__m128i const *)filt1_global); filt2Reg = _mm_load_si128((__m128i const *)filt2_global); filt3Reg = _mm_load_si128((__m128i const *)filt3_global); filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); // add and saturate all the results together minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pixels_per_line; // save only 8 bytes _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); output_ptr += output_pitch; } }
void lp_rast_triangle_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); int x = (arg.triangle.plane_mask & 0xff) + task->x; int y = (arg.triangle.plane_mask >> 8) + task->y; unsigned i, j; struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; unsigned nr = 0; __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = _mm_setzero_si128(); __m128i c; __m128i dcdx; __m128i dcdy; __m128i rej4; __m128i dcdx2; __m128i dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; transpose4_epi32(&p0, &p1, &p2, &zero, &c, &dcdx, &dcdy, &rej4); /* Adjust dcdx; */ dcdx = _mm_sub_epi32(zero, dcdx); c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); rej4 = _mm_slli_epi32(rej4, 2); /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ c = _mm_sub_epi32(c, _mm_set1_epi32(1)); rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); dcdx2 = _mm_add_epi32(dcdx, dcdx); dcdx3 = _mm_add_epi32(dcdx2, dcdx); transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, &span_0, &span_1, &span_2, &unused); for (i = 0; i < 4; i++) { __m128i cx = c; for (j = 0; j < 4; j++) { __m128i c4rej = _mm_add_epi32(cx, rej4); __m128i rej_masks = _mm_srai_epi32(c4rej, 31); /* if (is_zero(rej_masks)) */ if (_mm_movemask_epi8(rej_masks) == 0) { __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); __m128i c_01 = _mm_packs_epi32(c_0, c_1); __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); __m128i c_23 = _mm_packs_epi32(c_2, c_3); __m128i c_0123 = _mm_packs_epi16(c_01, c_23); unsigned mask = _mm_movemask_epi8(c_0123); out[nr].i = i; out[nr].j = j; out[nr].mask = mask; if (mask != 0xffff) nr++; } cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); } c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); } for (i = 0; i < nr; i++) lp_rast_shade_quads_mask(task, &tri->inputs, x + 4 * out[i].j, y + 4 * out[i].i, 0xffff & ~out[i].mask); }
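// A scalar sketch of the sign-bit adjustment commented above: the
// per-pixel coverage test "c <= 0" is rewritten as "c - 1 < 0" so that
// only the sign bit matters (hence _mm_srai_epi32 by 31 and movemask),
// and rej4 is bumped by 1 so the trivial-reject sum c + rej4 keeps its
// original value.
#include <stdint.h>

static int outside_plane(int32_t c) {
  return (c - 1) < 0;  /* same truth value as c <= 0, sign bit only */
}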
__m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2; int vdwjidx0A,vdwjidx0B; __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0; __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00; __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10; __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20; __m128d velec,felec,velecsum,facel,crf,krf,krf2; real *charge; int nvdwtype; __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6; int *vdwtype; real *vdwparam; __m128d one_sixth = _mm_set1_pd(1.0/6.0); __m128d one_twelfth = _mm_set1_pd(1.0/12.0); __m128i vfitab; __m128i ifour = _mm_set1_epi32(4); __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF; real *vftab; __m128d dummy_mask,cutoff_mask; __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) ); __m128d one = _mm_set1_pd(1.0); __m128d two = _mm_set1_pd(2.0); x = xx[0]; f = ff[0]; nri = nlist->nri; iinr = nlist->iinr; jindex = nlist->jindex; jjnr = nlist->jjnr; shiftidx = nlist->shift; gid = nlist->gid;
void lp_rast_triangle_3_4(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; unsigned y = (arg.triangle.plane_mask >> 8) + task->y; __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = _mm_setzero_si128(); __m128i c; __m128i dcdx; __m128i dcdy; __m128i dcdx2; __m128i dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; transpose4_epi32(&p0, &p1, &p2, &zero, &c, &dcdx, &dcdy, &unused); /* Adjust dcdx; */ dcdx = _mm_sub_epi32(zero, dcdx); c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ c = _mm_sub_epi32(c, _mm_set1_epi32(1)); dcdx2 = _mm_add_epi32(dcdx, dcdx); dcdx3 = _mm_add_epi32(dcdx2, dcdx); transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, &span_0, &span_1, &span_2, &unused); { __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0); __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); __m128i c_01 = _mm_packs_epi32(c_0, c_1); __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); __m128i c_23 = _mm_packs_epi32(c_2, c_3); __m128i c_0123 = _mm_packs_epi16(c_01, c_23); unsigned mask = _mm_movemask_epi8(c_0123); if (mask != 0xffff) lp_rast_shade_quads_mask(task, &tri->inputs, x, y, 0xffff & ~mask); } }
int sse_auction_search(int *pr, int *P, int *ai0, int *ai1, int *a0, int *a1, int nodes, int arcs, int s, int t) { int i __attribute__ ((aligned (16))) = 0; int j __attribute__ ((aligned (16))) = t; int k __attribute__ ((aligned (16))) = 0; int m __attribute__ ((aligned (16))) = 0; int maxla __attribute__ ((aligned (32))) = 0; int argmaxla __attribute__ ((aligned (16))) = 0; int cost __attribute__ ((aligned (16))) = 0; int length __attribute__ ((aligned (16))) = 1; int path_cost __attribute__ ((aligned (16))) = 0; int tmp1, tmp2; /* signed: compared against -1 and maxla below */ int cost_tab[nodes+1]; __m128i a0sse, a1sse, ai0sse, ai1sse, ai1sse1, I, J, K, M, then; __m128i ARCS, MNODES, INFINITE, NEGINF, prsse, Psse, MAXLA, ARGMAXLA, LA, mask1, mask2, mask3, COST; for(i = 0; i <= nodes; i++) { cost_tab[i] = 0; } if(check_s_t(s, t, P, nodes) != 0) { return 1; } while(P[s] == INF) { k = -1; m = -1; //printf("j = %d\n", j); J = _mm_set1_epi32(j); //current value of j K = _mm_set1_epi32(-1); //start index into the edge cost table M = _mm_set1_epi32(-1); //end index into the edge cost table MNODES = _mm_set1_epi32(nodes-1); //node count minus 1 (to detect the end of the table) ARCS = _mm_set1_epi32(arcs); //number of edges /* compute k, m */ for(i = 0; i < nodes; i+=4) { ai0sse = _mm_load_si128((__m128i*) &ai0[i]); //load ai0 (node numbers) ai1sse = _mm_load_si128((__m128i*) &ai1[i]); //load ai1 (indices into the edge table) ai1sse1 = _mm_set_epi32(ai1[i+4],ai1[i+3],ai1[i+2],ai1[i+1]); //load the ai1 indices shifted by 1 mask1 = _mm_cmpeq_epi32(J, ai0sse); //test the condition j == ai0[i] K = _mm_or_si128(_mm_and_si128(mask1,ai1sse), _mm_andnot_si128(mask1,K)); //select K I = _mm_set_epi32(i+3, i+2, i+1, i); //current values of i mask2 = _mm_cmplt_epi32(I, MNODES); //test the condition i < nodes-1 mask3 = _mm_and_si128(mask1,mask2); //combination of conditions 1 and 2 then = _mm_or_si128(_mm_and_si128(mask2,ai1sse1), _mm_andnot_si128(mask2,ARCS)); //m = ai1[i+1] or arcs M = _mm_or_si128(_mm_and_si128(mask3,then), _mm_andnot_si128(mask3,M)); //select M } for(i = 0; i < nodes; i++) { if(ai0[i] == j) { k = ai1[i]; //k - start index of the edges leaving j //printf("i = %d ", i); if(i < nodes - 1) { m = ai1[i+1]; } else { m = arcs; } } } /* store k, m */ for(i = 0; i < 4; i++) { tmp1 = get_from_m128i(K,i); tmp2 = get_from_m128i(M,i); if(tmp1 != -1) { k = tmp1; } if(tmp2 != -1) { m = tmp2; } } //printf("K,M: %d %d\n", k, m); /* pick the optimal edge */ if(k != -1) { INFINITE = _mm_set1_epi32(INF); //the "infinite" value NEGINF = _mm_set1_epi32(0-INF); //the value -INF COST = _mm_set1_epi32(cost); //cost of the chosen edge MAXLA = _mm_set1_epi32(0-INF); //maximum value of la = pr[a0[i]] - a1[i] ARGMAXLA = _mm_set1_epi32(-1); //index for which la is largest for(i = k; i < m; i+=4) { a1sse = _mm_set_epi32(a1[i],a1[i+1],a1[i+2],a1[i+3]); //load a1 a0sse = _mm_set_epi32(a0[i],a0[i+1],a0[i+2],a0[i+3]); //load a0 prsse = _mm_set_epi32(pr[a0[i]],pr[a0[i+1]],pr[a0[i+2]],pr[a0[i+3]]); //load pr Psse = _mm_set_epi32(P[a0[i]],P[a0[i+1]],P[a0[i+2]],P[a0[i+3]]); //load P mask1 = _mm_cmpgt_epi32(_mm_set1_epi32(m),_mm_set_epi32(i,i+1,i+2,i+3)); //lane-in-range guard for the last iteration prsse = _mm_or_si128(_mm_and_si128(mask1,prsse), _mm_andnot_si128(mask1,NEGINF)); //mask out-of-range lanes to -INF LA = _mm_sub_epi32(prsse, a1sse); //la = pr[a0[i]] - a1[i] then = _mm_max_epi32(LA,MAXLA); //maximum value of la, maxla mask1 = _mm_cmpeq_epi32(Psse,INFINITE); //is P[i] == INF mask2 = _mm_and_si128(mask1,_mm_cmpgt_epi32(LA,MAXLA)); //is P[i] == INF and LA > MAXLA MAXLA = _mm_or_si128(_mm_and_si128(mask1,then), _mm_andnot_si128(mask1,MAXLA)); //update maxla ARGMAXLA = _mm_or_si128(_mm_and_si128(mask2,a0sse), _mm_andnot_si128(mask2,ARGMAXLA)); //update argmaxla COST = _mm_or_si128(_mm_and_si128(mask2,a1sse), _mm_andnot_si128(mask2,COST)); //update cost } } /* store maxla, argmaxla, cost */ maxla = 0 - INF; for(i = 0; i < 4; i++) { tmp1 = get_from_m128i(MAXLA,i); if(tmp1 > maxla) { argmaxla = get_from_m128i(ARGMAXLA,i); maxla = tmp1; cost = get_from_m128i(COST,i); } } //printf("COST: %d, PATH_COST: %d\n", cost, path_cost); //printf("pr[j] = %d, maxla = %d, argmaxla = %d\n", pr[j], maxla, argmaxla); /* path contraction */ if(pr[j] > maxla || maxla == -INF) { /* update the price */ pr[j] = maxla; /* a single-element path is not contracted */ if(j != t) { /* update the path */ P[j] = INF; length = length - 1; path_cost = path_cost - cost_tab[length]; cost_tab[length] = 0; /* go back to the previous vertex in the path (j); k is the one cut off */ k = j; for(i = 0; i < nodes; i++) { if(P[i] == length - 1) { j = i; break; } } } } /* path extension */ else { P[argmaxla] = length; j = argmaxla; path_cost = path_cost + cost; cost_tab[length] = cost; length = length + 1; /* the path reached the start vertex => done */ if(argmaxla == s) { printf("path cost: %d\n", path_cost); return 0; } } } return 0; }
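// The search above uses the classic SSE2 branchless select throughout,
// (mask AND a) OR (NOT mask AND b); a named helper form of that pattern:
#include <emmintrin.h>

static __m128i select_epi32(__m128i mask, __m128i a, __m128i b) {
  /* per lane: mask all-ones -> a, mask all-zeros -> b */
  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}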
RETi SET( const int &x ) { return _mm_set1_epi32(x); }