// Computes part of matrix.vector v = Wu. Computes N=8 results.
// For details see PartialMatrixDotVector64 with N=8.
static void PartialMatrixDotVector8(const int8_t* wi, const double* scales, const int8_t* u,
                                    int num_in, int num_out, double* v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
  __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
    __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
    for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
      __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
      // Mul-add, with horizontal add of the 4 inputs to each of the results.
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
    }
  }
  ExtractResults(result0, shift_id, wi, scales, num_out, v);
}
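The `ones` register above exists only for the 16->32 bit horizontal add mentioned in the comment. Below is a minimal, self-contained sketch of that idiom using `_mm256_madd_epi16`; whether `MultiplyGroup` uses exactly this instruction is an assumption here, not something taken from the code above.

// Sketch: multiplying by 16-bit ones with madd sums adjacent 16-bit lanes
// into 32-bit lanes (the 16->32 bit horizontal add described above).
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  __m256i x = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                                7,  6,  5,  4,  3,  2, 1, 0);
  __m256i ones = _mm256_set1_epi16(1);  // same contents as the set_epi16(1,...,1) above
  // Lane i of the result is x[2i]*1 + x[2i+1]*1, widened to 32 bits.
  __m256i sums = _mm256_madd_epi16(x, ones);
  alignas(32) int32_t out[8];
  _mm256_store_si256(reinterpret_cast<__m256i*>(out), sums);
  for (int i = 0; i < 8; ++i) std::printf("%d ", out[i]);  // 1 5 9 13 17 21 25 29
  std::printf("\n");
  return 0;
}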
// Builds a vector holding the 16-bit lane indices 0..15; note that
// _mm256_set_epi16 takes its arguments from the highest lane down to lane 0.
result_type operator()(T const&) const
{
    result_type that = {
        _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                          7,  6,  5,  4,  3,  2, 1, 0)
    };
    return that;
}
int main() {
    const ssize_t A = 3;
    const size_t Awidth = 2;
    const size_t Dwidth = 4;
    const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
    const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
    const ssize_t Cwidth = Awidth + Dwidth;
    const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
    const size_t numCodewords = (1ull << Cwidth);
    std::cout << "numCodewords: " << numCodewords << std::endl;
    const size_t numMasks = numCodewords / (sizeof(int) * 4); // How many masks will we generate?
    int * pNonCodewordMasks = new int[numMasks];
    const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
    std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;
    for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
        int tmpMask = 0;
        for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
            if ((cw % A) != 0) { // we want the non-codewords
                // std::cout << "cw % A != 0: " << cw << std::endl;
                tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1)); // expand to 32 bits, because AVX2 cannot movemask across lanes to 16 bits
            }
        }
        pNonCodewordMasks[posMask] = tmpMask;
    }
    std::cout << "numMasks: " << numMasks << std::endl;
    std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
    for (size_t posMask = 0; posMask < numMasks; ++posMask) {
        std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
    }
    std::cout << std::dec << std::endl << std::setfill(' ');
    auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8,
                                        c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
    auto mmAddUp = _mm256_set1_epi16(16);
    auto mmAinv = _mm256_set1_epi16(AInv);
    auto mmDmin = _mm256_set1_epi16(Dmin);
    auto mmDmax = _mm256_set1_epi16(Dmax);
    const size_t posEnd = (1ull << Cwidth);
    __m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))}; // fill up all non-codeword bits with 1's if necessary
    std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
    std::cout << std::setfill('0') << std::hex;
    for (size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
        auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
        auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
        auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
        auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
        auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
        auto mm5 = _mm256_or_si256(mm3, mm4);
        auto mm6 = _mm256_and_si256(mm2, mm5);
        auto mask = _mm256_movemask_epi8(mm6);
        if (mask & pNonCodewordMasks[posMask]) {
            std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask
                      << " & 0x" << pNonCodewordMasks[posMask]
                      << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        } else {
            std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask
                      << " & 0x" << pNonCodewordMasks[posMask]
                      << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        }
        mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
    }
    std::cout << std::setfill(' ') << std::dec;
}
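The inline comment about expanding each mask entry to two bits reflects an AVX2 limitation: there is no 16-bit movemask, so `_mm256_movemask_epi8` is applied to the 16-bit compare result and every element contributes a pair of identical bits. A small self-contained demonstration, independent of the program above:

// Each 16-bit lane of a compare result is all-ones or all-zeros, so the
// byte-wise movemask reports every lane as two identical bits.
#include <immintrin.h>
#include <cstdio>

int main() {
    __m256i a = _mm256_set1_epi16(2);
    __m256i b = _mm256_set_epi16(1, 3, 1, 3, 1, 3, 1, 3,
                                 1, 3, 1, 3, 1, 3, 1, 3);
    int mask = _mm256_movemask_epi8(_mm256_cmpgt_epi16(a, b));
    std::printf("0x%08x\n", mask);  // prints 0xcccccccc: one 2-bit pair per 16-bit lane
    return 0;
}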
static void
avx2_test (void)
{
  union256i_w u, s1, s2;
  short e[16];
  int i;

  s1.x = _mm256_set_epi16 (1, 2, 3, 4, 10, 20, 30, 90, -80, -40, -100, 76,
                           -100, -34, -78, -31000);
  s2.x = _mm256_set_epi16 (88, 44, 3, 22, 11, 98, 76, -100, -34, -78, 30, 90,
                           -80, -40, -100, -15);

  u.x = _mm256_cmpgt_epi16 (s1.x, s2.x);

  for (i = 0; i < 16; i++)
    e[i] = (s1.a[i] > s2.a[i]) ? -1 : 0;

  if (check_union256i_w (u, e))
    abort ();
}
    int8_t similar = NEG_LIMIT;
    int8_t length = NEG_LIMIT;
    __m256i vNegLimit = _mm256_set1_epi8(NEG_LIMIT);
    __m256i vPosLimit = _mm256_set1_epi8(POS_LIMIT);
    __m256i vSaturationCheckMin = vPosLimit;
    __m256i vSaturationCheckMax = vNegLimit;
    __m256i vNegInf = _mm256_set1_epi8(NEG_LIMIT);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 1); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi8(open);
    __m256i vGap = _mm256_set1_epi8(gap);
    __m256i vZero = _mm256_set1_epi8(0);
    __m256i vOne = _mm256_set1_epi8(1);
    __m256i vOne16 = _mm256_set1_epi16(1);
    __m256i vNegOne16 = _mm256_set1_epi16(-1);
    __m256i vN16 = _mm256_set1_epi16(N);
    __m256i vILo16 = _mm256_set_epi16(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31);
    __m256i vIHi16 = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m256i vJresetLo16 = _mm256_set_epi16(-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
    __m256i vJresetHi16 = _mm256_set_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
    __m256i vMaxH = vNegInf;
    __m256i vMaxM = vNegInf;
    __m256i vMaxS = vNegInf;
    __m256i vMaxL = vNegInf;
    __m256i vEndILo = vNegInf;
    __m256i vEndIHi = vNegInf;
    __m256i vEndJLo = vNegInf;
    __m256i vEndJHi = vNegInf;
    __m256i vILimit16 = _mm256_set1_epi16(s1Len);
    __m256i vJLimit16 = _mm256_set1_epi16(s2Len);

    /* convert _s1 from char to int in range 0-23 */
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int16_t end_query = 0;
    int16_t end_ref = 0;
    int16_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi16(NEG_INF);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 2); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi16(open);
    __m256i vGap = _mm256_set1_epi16(gap);
    __m256i vZero = _mm256_set1_epi16(0);
    __m256i vOne = _mm256_set1_epi16(1);
    __m256i vN = _mm256_set1_epi16(N);
    __m256i vNegOne = _mm256_set1_epi16(-1);
    __m256i vI = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m256i vJreset = _mm256_set_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
    __m256i vMax = vNegInf;
    __m256i vEndI = vNegInf;
    __m256i vEndJ = vNegInf;
    __m256i vILimit = _mm256_set1_epi16(s1Len);
    __m256i vJLimit = _mm256_set1_epi16(s2Len);

    /* convert _s1 from char to int in range 0-23 */
    for (i=0; i<s1Len; ++i) {
        s1[i] = matrix->mapper[(unsigned char)_s1[i]];
    }
    /* pad back of s1 with dummy values */
    for (i=s1Len; i<s1Len_PAD; ++i) {
        s1[i] = 0; /* point to first matrix row because we don't care */
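Both of the snippets above "shift in a 0" with a helper named `_mm256_srli_si256_rpl`, because the plain `_mm256_srli_si256` only shifts within each 128-bit lane rather than across the whole register. Below is a sketch of one way such a whole-register byte shift can be built for the 1-byte case; it is an illustration under that assumption, not necessarily the library's exact definition:

#include <immintrin.h>

/* Whole-register shift right by one byte, zero-filled at the top.
 * _mm256_srli_si256 shifts each 128-bit lane independently, so the high
 * lane is first moved down (and zeroed above) with permute2x128, then
 * alignr stitches the lanes back together with a 1-byte offset. */
static inline __m256i srli_si256_1_sketch(__m256i a)
{
    /* t = [ low 128 <- high 128 of a, high 128 <- zero ] */
    __m256i t = _mm256_permute2x128_si256(a, a, 0x81);
    /* per lane: take bytes 1..16 of the 32-byte concat (t_lane : a_lane) */
    return _mm256_alignr_epi8(t, a, 1);
}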
// AVX alignment warning:
// Do not use this routine unless the scan lines in the bitmap are all aligned to AVX boundaries,
// meaning that the bitmap base is aligned to a 32 byte boundary and the scan line "stride" is a
// multiple of 32 bytes. Besides the performance loss of unaligned access, program testing on
// current Intel hardware says that unaligned access triggers a fault from the processor.
//
// NOTE: Experience on my development laptop says this isn't much faster than SSE. However
//       I'm going to assume that's just my laptop, and that maybe in the future, AVX will
//       get faster.
template <class T> void stretchblt_bilinear_avx(const rgb_bitmap_info &dbmp,const rgb_bitmap_info &sbmp) {
#if HAVE_CPU_AVX2
    // WARNING: This code assumes typical RGBA type packing where red and blue are NOT adjacent,
    //          and alpha and green are not adjacent
    nr_wfpack sx={0,0},sy={0,0},stepx,stepy;
    static vinterp_tmp<__m256i> vinterp_tmp;
    const T rbmask = (T)(dbmp.rgbinfo.r.mask+dbmp.rgbinfo.b.mask);
    const T abmask = (T)(dbmp.rgbinfo.g.mask+dbmp.rgbinfo.a.mask);
    __m256i rmask256,gmask256,bmask256,mul256;
    const size_t pixels_per_group = sizeof(__m256i) / sizeof(T);
    unsigned int src_bitmap_width_in_groups = (sbmp.width + pixels_per_group - 1) / pixels_per_group;
    unsigned char *drow;
    size_t ox,oy;
    T fshift;
    T pshift;
    T fmax;
    T mul;

    pshift = std::min(dbmp.rgbinfo.r.bwidth,std::min(dbmp.rgbinfo.g.bwidth,dbmp.rgbinfo.b.bwidth));
    fshift = (sizeof(nr_wftype) * 8) - pshift;
    fmax = 1U << pshift;

    if (sizeof(T) == 4) {
        rmask256 = _mm256_set_epi16(
            0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,
            0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF);
    }
    else {
        rmask256 = _mm256_set_epi16(
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask);
        gmask256 = _mm256_set_epi16(
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask);
        bmask256 = _mm256_set_epi16(
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask);
    }

    render_scale_from_sd(/*&*/stepx,dbmp.width,sbmp.width);
    render_scale_from_sd(/*&*/stepy,dbmp.height,sbmp.height);

    if (dbmp.width == 0 || src_bitmap_width_in_groups == 0) return;

    drow = dbmp.get_scanline<uint8_t>(0);
    oy = dbmp.height;
    do {
        T *s2 = sbmp.get_scanline<T>(sy.w+1);
        T *s = sbmp.get_scanline<T>(sy.w);
        T *d = (T*)drow;

        mul = (T)(sy.f >> fshift);
        {
            unsigned int m = (mul & (~1U)) << (15 - pshift); // 16-bit MMX multiply (signed bit), remove one bit to match precision
            mul256 = _mm256_set_epi16(
                m,m,m,m,m,m,m,m,
                m,m,m,m,m,m,m,m);
        }

        if (mul != 0) {
            if (stepx.w != 1 || stepx.f != 0) {
                // horizontal interpolation, vertical interpolation
                if (sizeof(T) == 4)
                    stretchblt_line_bilinear_vinterp_stage_avx_argb8(vinterp_tmp.tmp,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,rmask256);
                else
                    stretchblt_line_bilinear_vinterp_stage_avx_rgb16(vinterp_tmp.tmp,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,
                        rmask256,dbmp.rgbinfo.r.shift,
                        gmask256,dbmp.rgbinfo.g.shift,
                        bmask256,dbmp.rgbinfo.b.shift);

                stretchblt_line_bilinear_hinterp_stage<T>(d,(T*)vinterp_tmp.tmp,sx,stepx,dbmp.width,rbmask,abmask,fmax,fshift,pshift);
            }
            else {
                // vertical interpolation only
                if (sizeof(T) == 4)
                    stretchblt_line_bilinear_vinterp_stage_avx_argb8((__m256i*)d,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,rmask256);
                else
                    stretchblt_line_bilinear_vinterp_stage_avx_rgb16((__m256i*)d,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,
                        rmask256,dbmp.rgbinfo.r.shift,
                        gmask256,dbmp.rgbinfo.g.shift,
                        bmask256,dbmp.rgbinfo.b.shift);
            }
        }
        else {
            if (stepx.w != 1 || stepx.f != 0) {
                // horizontal interpolation, no vertical interpolation
                stretchblt_line_bilinear_hinterp_stage<T>(d,s,sx,stepx,dbmp.width,rbmask,abmask,fmax,fshift,pshift);
            }
            else {
                // copy the scanline 1:1 no interpolation
                memcpy(d,s,dbmp.width*sizeof(T));
            }
        }

        if ((--oy) == 0) break;
        drow += dbmp.stride;
        sy += stepy;
    } while (1);
#endif
}
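For context on the `mul256` setup above (the vertical fraction packed so a signed 16-bit multiply stays in range, which is why one bit is cleared and the value shifted by `15 - pshift`), here is a hedged sketch of the kind of per-channel vertical blend a `*_vinterp_stage_*` helper performs. The helper name is hypothetical and the use of `_mm256_mulhrs_epi16` is an assumption for illustration, not taken from this codebase:

#include <immintrin.h>

// Hypothetical per-channel vertical blend: out = s + (s2 - s) * frac, with
// frac carried as a non-negative Q15 value so a signed 16-bit multiply works.
// Channel values are assumed to already be unpacked/masked into 16-bit lanes.
static inline __m256i vinterp_blend_sketch(__m256i s, __m256i s2, __m256i fracQ15)
{
    __m256i diff = _mm256_sub_epi16(s2, s);            // signed difference per channel
    __m256i step = _mm256_mulhrs_epi16(diff, fracQ15); // (diff * frac) >> 15, rounded
    return _mm256_add_epi16(s, step);                  // blend toward s2 by frac
}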