Example #1
// Computes part of matrix.vector v = Wu. Computes N=8 results.
// For details see PartialMatrixDotVector64 with N=8.
static void PartialMatrixDotVector8(const int8_t* wi, const double* scales,
                                    const int8_t* u, int num_in, int num_out,
                                    double* v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
  __m256i ones =
      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
    __m256i inputs =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
    for (int ig = 0; ig < kNumInputGroups && j < num_in;
         ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
      __m256i rep_input =
          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
      // Mul-add, with horizontal add of the 4 inputs to each of the results.
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
    }
  }
  ExtractResults(result0, shift_id, wi, scales, num_out, v);
}
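The `ones` register above feeds a common AVX2 idiom for widening horizontal adds: `_mm256_madd_epi16` multiplies adjacent signed 16-bit elements and sums each pair into a 32-bit lane, so multiplying by a vector of all ones collapses 16-bit pairs into 32-bit sums. A minimal standalone sketch of that idiom (not part of the project above):

#include <immintrin.h>
#include <cstdio>

int main() {
  // Pairwise 16->32 bit horizontal add: madd against a vector of ones.
  __m256i ones = _mm256_set1_epi16(1);
  __m256i v = _mm256_set_epi16(16, 15, 14, 13, 12, 11, 10, 9,
                               8, 7, 6, 5, 4, 3, 2, 1);
  __m256i sums = _mm256_madd_epi16(v, ones);  // each 32-bit lane = lo + hi
  alignas(32) int out[8];
  _mm256_store_si256(reinterpret_cast<__m256i*>(out), sums);
  for (int i = 0; i < 8; ++i) printf("%d ", out[i]);  // 3 7 11 15 19 23 27 31
  printf("\n");
}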
Example #2
File: arith.hpp Project: faldah/nt2
 result_type operator()(T const&) const
 {
   result_type that = { _mm256_set_epi16( 15, 14, 13, 12, 11, 10, 9, 8
                                        , 7,  6,  5,  4,  3,  2,  1, 0
                                        )
                      };
   return that;
 }
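Note the argument order: `_mm256_set_epi16` lists elements from the highest lane down to element 0, so the call above produces an ascending 0..15 ramp in memory. A quick standalone check (not part of nt2):

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256i ramp = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                                  7, 6, 5, 4, 3, 2, 1, 0);
  short out[16];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), ramp);
  for (int i = 0; i < 16; ++i) printf("%d ", out[i]);  // 0 1 2 ... 15
  printf("\n");
}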
Example #3
#include <immintrin.h>
#include <iostream>
#include <iomanip>
#include <cstdint>
#include <sys/types.h>  // ssize_t

// Note: ext_euklidean (presumably an extended Euclidean routine used to obtain
// the modular inverse of A) is defined elsewhere in the project.
int main() {
	const ssize_t A = 3;
	const size_t Awidth = 2;
	const size_t Dwidth = 4;
	const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
	const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
	const ssize_t Cwidth = Awidth + Dwidth;
	const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
	const size_t numCodewords = (1ull << Cwidth);
	std::cout << "numCodewords: " << numCodewords << std::endl;
	const size_t numMasks = numCodewords / (sizeof(int) * 4); // How many masks will we generate?
	int * pNonCodewordMasks = new int[numMasks];
	const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
	std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;
	for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
		int tmpMask = 0;
		for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
			if ((cw % A) != 0) { // we want the non-codewords
				// std::cout << "cw % A != 0: " << cw << std::endl;
				tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1)); // expand to 32 bits, because AVX2 cannot movemask across lanes to 16 bits
			}
		}
		pNonCodewordMasks[posMask] = tmpMask;
	}
	std::cout << "numMasks: " << numMasks << std::endl;
	std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
	for (size_t posMask = 0; posMask < numMasks; ++posMask) {
		std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
	}
	std::cout << std::dec << std::endl << std::setfill(' ');
	auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8, c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
	auto mmAddUp = _mm256_set1_epi16(16);
	auto mmAinv = _mm256_set1_epi16(AInv);
	auto mmDmin = _mm256_set1_epi16(Dmin);
	auto mmDmax = _mm256_set1_epi16(Dmax);
	const size_t posEnd = (1ull << Cwidth);
	__m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))}; // fill up all non-codeword bits with 1's if necessary
	std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
	std::cout << std::setfill('0') << std::hex;
	for(size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
		auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
		auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
		auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
		auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
		auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
		auto mm5 = _mm256_or_si256(mm3, mm4);
		auto mm6 = _mm256_and_si256(mm2, mm5);
		auto mask = _mm256_movemask_epi8(mm6);
		if (mask & pNonCodewordMasks[posMask]) {
			std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
		} else {
			std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
		}
		mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
	}
	std::cout << std::setfill(' ') << std::dec;
}
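As the comment about expanding to 32 bits hints, `_mm256_movemask_epi8` only sees bytes, so a 16-bit comparison result shows up as two identical bits per element; that is why the program reserves two mask bits per codeword. A standalone illustration (not part of the program above):

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256i a = _mm256_set1_epi16(5);
  __m256i b = _mm256_set_epi16(15, 14, 13, 12, 11, 10, 9, 8,
                               7, 6, 5, 4, 3, 2, 1, 0);
  __m256i gt = _mm256_cmpgt_epi16(a, b);  // true for the elements holding 0..4
  int mask = _mm256_movemask_epi8(gt);    // two mask bits per 16-bit element
  printf("0x%08x\n", mask);               // 0x000003ff
}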
Example #4
static void
avx2_test (void)
{
  union256i_w u, s1, s2;
  short e[16];
  int i;

  s1.x = _mm256_set_epi16 (1, 2, 3, 4, 10, 20, 30, 90, -80, -40, -100,
			   76, -100, -34, -78, -31000);

  s2.x = _mm256_set_epi16 (88, 44, 3, 22, 11, 98, 76, -100, -34, -78,
			   30, 90, -80, -40, -100, -15);

  u.x = _mm256_cmpgt_epi16 (s1.x, s2.x);

  for (i = 0; i < 16; i++)
    e[i] = (s1.a[i] > s2.a[i]) ? -1 : 0;

  if (check_union256i_w (u, e))
    abort ();
}
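`union256i_w` and `check_union256i_w` appear to be GCC testsuite helpers; the same check can be written standalone with plain stores (a sketch, not the testsuite's code):

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256i s1 = _mm256_set_epi16(1, 2, 3, 4, 10, 20, 30, 90, -80, -40, -100,
                                76, -100, -34, -78, -31000);
  __m256i s2 = _mm256_set_epi16(88, 44, 3, 22, 11, 98, 76, -100, -34, -78,
                                30, 90, -80, -40, -100, -15);
  __m256i u = _mm256_cmpgt_epi16(s1, s2);

  short a1[16], a2[16], r[16];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(a1), s1);
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(a2), s2);
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(r), u);
  for (int i = 0; i < 16; ++i) {
    short expected = (a1[i] > a2[i]) ? -1 : 0;  // cmpgt yields all-ones or zero
    if (r[i] != expected) { printf("mismatch at element %d\n", i); return 1; }
  }
  printf("ok\n");
}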
Example #5
    int8_t similar = NEG_LIMIT;
    int8_t length = NEG_LIMIT;
    __m256i vNegLimit = _mm256_set1_epi8(NEG_LIMIT);
    __m256i vPosLimit = _mm256_set1_epi8(POS_LIMIT);
    __m256i vSaturationCheckMin = vPosLimit;
    __m256i vSaturationCheckMax = vNegLimit;
    __m256i vNegInf = _mm256_set1_epi8(NEG_LIMIT);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 1); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi8(open);
    __m256i vGap  = _mm256_set1_epi8(gap);
    __m256i vZero = _mm256_set1_epi8(0);
    __m256i vOne = _mm256_set1_epi8(1);
    __m256i vOne16 = _mm256_set1_epi16(1);
    __m256i vNegOne16 = _mm256_set1_epi16(-1);
    __m256i vN16 = _mm256_set1_epi16(N);
    __m256i vILo16 = _mm256_set_epi16(16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31);
    __m256i vIHi16 = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m256i vJresetLo16 = _mm256_set_epi16(-16,-17,-18,-19,-20,-21,-22,-23,-24,-25,-26,-27,-28,-29,-30,-31);
    __m256i vJresetHi16 = _mm256_set_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
    __m256i vMaxH = vNegInf;
    __m256i vMaxM = vNegInf;
    __m256i vMaxS = vNegInf;
    __m256i vMaxL = vNegInf;
    __m256i vEndILo = vNegInf;
    __m256i vEndIHi = vNegInf;
    __m256i vEndJLo = vNegInf;
    __m256i vEndJHi = vNegInf;
    __m256i vILimit16 = _mm256_set1_epi16(s1Len);
    __m256i vJLimit16 = _mm256_set1_epi16(s2Len);

    /* convert _s1 from char to int in range 0-23 */
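The descending and negative ramps above act as per-lane row and column counters for an anti-diagonal sweep. A minimal standalone sketch of that counter pattern (names chosen here, not parasail's):

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256i vI = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7,
                                8, 9, 10, 11, 12, 13, 14, 15);
  __m256i vOne = _mm256_set1_epi16(1);
  __m256i vILimit = _mm256_set1_epi16(10);
  for (int step = 0; step < 3; ++step) {
    // Lanes whose counter is still below the limit remain active.
    __m256i active = _mm256_cmpgt_epi16(vILimit, vI);
    printf("step %d: active byte mask 0x%08x\n", step,
           _mm256_movemask_epi8(active));
    vI = _mm256_add_epi16(vI, vOne);  // advance every lane's counter at once
  }
}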
Example #6
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int16_t end_query = 0;
    int16_t end_ref = 0;
    int16_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi16(NEG_INF);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 2); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi16(open);
    __m256i vGap  = _mm256_set1_epi16(gap);
    __m256i vZero = _mm256_set1_epi16(0);
    __m256i vOne = _mm256_set1_epi16(1);
    __m256i vN = _mm256_set1_epi16(N);
    __m256i vNegOne = _mm256_set1_epi16(-1);
    __m256i vI = _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m256i vJreset = _mm256_set_epi16(0,-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15);
    __m256i vMax = vNegInf;
    __m256i vEndI = vNegInf;
    __m256i vEndJ = vNegInf;
    __m256i vILimit = _mm256_set1_epi16(s1Len);
    __m256i vJLimit = _mm256_set1_epi16(s2Len);
    

    /* convert _s1 from char to int in range 0-23 */
    for (i=0; i<s1Len; ++i) {
        s1[i] = matrix->mapper[(unsigned char)_s1[i]];
    }
    /* pad back of s1 with dummy values */
    for (i=s1Len; i<s1Len_PAD; ++i) {
        s1[i] = 0; /* point to first matrix row because we don't care */
    }
Example #7
// AVX alignment warning:
// Do not use this routine unless the scan lines in the bitmap are all aligned to AVX boundaries,
// meaning that the bitmap base is aligned to a 32 byte boundary and the scan line "stride" is a
// multiple of 32 bytes. Besides the performance loss of unaligned access, program testing on
// current Intel hardware says that unaligned access triggers a fault from the processor.
//
// NOTE: Experience on my development laptop says this isn't much faster than SSE. However
//       I'm going to assume that's just my laptop, and that maybe in the future, AVX will
//       get faster.
template <class T> void stretchblt_bilinear_avx(const rgb_bitmap_info &dbmp,const rgb_bitmap_info &sbmp) {
#if HAVE_CPU_AVX2
    // WARNING: This code assumes typical RGBA type packing where red and blue are NOT adjacent, and alpha and green are not adjacent
    nr_wfpack sx={0,0},sy={0,0},stepx,stepy;
    static vinterp_tmp<__m256i> vinterp_tmp;
    const T rbmask = (T)(dbmp.rgbinfo.r.mask+dbmp.rgbinfo.b.mask);
    const T abmask = (T)(dbmp.rgbinfo.g.mask+dbmp.rgbinfo.a.mask);
    __m256i rmask256,gmask256,bmask256,mul256;
    const size_t pixels_per_group =
        sizeof(__m256i) / sizeof(T);
    unsigned int src_bitmap_width_in_groups =
        (sbmp.width + pixels_per_group - 1) / pixels_per_group;
    unsigned char *drow;
    size_t ox,oy;
    T fshift;
    T pshift;
    T fmax;
    T mul;

    pshift = std::min(dbmp.rgbinfo.r.bwidth,std::min(dbmp.rgbinfo.g.bwidth,dbmp.rgbinfo.b.bwidth));
    fshift = (sizeof(nr_wftype) * 8) - pshift;
    fmax = 1U << pshift;

    if (sizeof(T) == 4) {
        rmask256 = _mm256_set_epi16(
            0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,
            0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF);
    }
    else {
        rmask256 = _mm256_set_epi16(
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,
            dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask,dbmp.rgbinfo.r.bmask);
        gmask256 = _mm256_set_epi16(
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,
            dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask,dbmp.rgbinfo.g.bmask);
        bmask256 = _mm256_set_epi16(
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,
            dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask,dbmp.rgbinfo.b.bmask);
    }

    render_scale_from_sd(/*&*/stepx,dbmp.width,sbmp.width);
    render_scale_from_sd(/*&*/stepy,dbmp.height,sbmp.height);
    if (dbmp.width == 0 || src_bitmap_width_in_groups == 0) return;

    drow = dbmp.get_scanline<uint8_t>(0);
    oy = dbmp.height;
    do {
        T *s2 = sbmp.get_scanline<T>(sy.w+1);
        T *s = sbmp.get_scanline<T>(sy.w);
        T *d = (T*)drow;

        mul = (T)(sy.f >> fshift);

        {
            unsigned int m = (mul & (~1U)) << (15 - pshift); // 16-bit MMX multiply (signed bit), remove one bit to match precision
            mul256 = _mm256_set_epi16(
                m,m,m,m,m,m,m,m,
                m,m,m,m,m,m,m,m);
        }

        if (mul != 0) {
            if (stepx.w != 1 || stepx.f != 0) {
                // horizontal interpolation, vertical interpolation
                if (sizeof(T) == 4)
                    stretchblt_line_bilinear_vinterp_stage_avx_argb8(vinterp_tmp.tmp,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,rmask256);
                else
                    stretchblt_line_bilinear_vinterp_stage_avx_rgb16(vinterp_tmp.tmp,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,
                        rmask256,dbmp.rgbinfo.r.shift,
                        gmask256,dbmp.rgbinfo.g.shift,
                        bmask256,dbmp.rgbinfo.b.shift);

                stretchblt_line_bilinear_hinterp_stage<T>(d,(T*)vinterp_tmp.tmp,sx,stepx,dbmp.width,rbmask,abmask,fmax,fshift,pshift);
            }
            else {
                // vertical interpolation only
                if (sizeof(T) == 4)
                    stretchblt_line_bilinear_vinterp_stage_avx_argb8((__m256i*)d,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,rmask256);
                else
                    stretchblt_line_bilinear_vinterp_stage_avx_rgb16((__m256i*)d,(__m256i*)s,(__m256i*)s2,mul256,src_bitmap_width_in_groups,
                        rmask256,dbmp.rgbinfo.r.shift,
                        gmask256,dbmp.rgbinfo.g.shift,
                        bmask256,dbmp.rgbinfo.b.shift);
            }
        }
        else {
            if (stepx.w != 1 || stepx.f != 0) {
                // horizontal interpolation, no vertical interpolation
                stretchblt_line_bilinear_hinterp_stage<T>(d,s,sx,stepx,dbmp.width,rbmask,abmask,fmax,fshift,pshift);
            }
            else {
                // copy the scanline 1:1 no interpolation
                memcpy(d,s,dbmp.width*sizeof(T));
            }
        }

        if ((--oy) == 0) break;
        drow += dbmp.stride;
        sy += stepy;
    } while (1);
#endif
}
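The `mul256` register above carries a 16-bit fractional weight for the vertical blend between scanlines `s` and `s2`. A simplified standalone sketch of that kind of weighted blend, using a Q15 fraction with `_mm256_mulhrs_epi16` (an illustration only, not the project's vinterp helpers):

#include <immintrin.h>
#include <cstdio>

int main() {
  __m256i rowA = _mm256_set1_epi16(100);     // channel values from one scanline
  __m256i rowB = _mm256_set1_epi16(200);     // channel values from the next scanline
  __m256i frac = _mm256_set1_epi16(0x4000);  // 0.5 in Q15
  __m256i delta = _mm256_sub_epi16(rowB, rowA);
  // _mm256_mulhrs_epi16 returns round((delta * frac) / 2^15), so the result is
  // rowA + (rowB - rowA) * 0.5 = 150 here.
  __m256i blended = _mm256_add_epi16(rowA, _mm256_mulhrs_epi16(delta, frac));
  short out[16];
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), blended);
  printf("%d\n", out[0]);  // 150
}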