Example #1
static void
avx2_test (void)
{
  union256i_d s1, s2;
  union256i_w u;
  short e[16];
  int i;

  s1.x = _mm256_set_epi32 (1, 2, 3, 4, 65000, 20, 30, 90);

  s2.x = _mm256_set_epi32 (88, 44, 33, 22, 11, 98, 76, -65000);

  u.x = _mm256_packs_epi32 (s1.x, s2.x);

  for (i = 0; i < 4; i++)
    {
      e[i] = int_to_short (s1.a[i]);
      e[i + 4] = int_to_short (s2.a[i]);
      e[i + 8] = int_to_short (s1.a[i + 4]);
      e[i + 12] = int_to_short (s2.a[i + 4]);
    }

  if (check_union256i_w (u, e))
    abort ();
}
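This test comes from the GCC AVX2 testsuite; union256i_d, check_union256i_w and abort are supplied by the harness headers. A minimal sketch of the saturating conversion the expected-value loop assumes (the real helper lives in the testsuite headers):

/* Sketch: saturate a 32-bit int to the signed 16-bit range,
   matching what _mm256_packs_epi32 does per element. */
static short
int_to_short (int i)
{
  if (i > 32767)
    return 32767;
  if (i < -32768)
    return -32768;
  return i;
}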
Example #2
File: main.cpp Project: sclc/DPP
#include <stdio.h>
#include <immintrin.h>

int
main(void)
{
    //_mm256_permutevar_ps
    __m256  da = _mm256_setr_ps(1,2,3,4,5,6,7,8);
    __m256i ds = _mm256_set_epi32(0,0,0,0,0,0,0,0x02);
    __m256  dc;

    printf("da: ");
    for(int i=0; i<sizeof(da)/sizeof(da.m256_f32[0]); i++)
        printf("%5.1f  ", da.m256_f32[i]);
    printf("\n");


    dc = _mm256_permutevar_ps(da, ds);

    printf("dc: ");
    for(int i=0; i<sizeof(dc)/sizeof(dc.m256_f32[0]); i++)
        printf("%5.1f  ", dc.m256_f32[i]);
    printf("\n");


    ds = _mm256_set_epi32(0,0,0,0,0,0,0,0x01);
    dc = _mm256_permutevar_ps(da, ds);

    printf("dc: ");
    for(int i=0; i<sizeof(dc)/sizeof(dc.m256_f32[0]); i++)
        printf("%5.1f  ", dc.m256_f32[i]);
    printf("\n\n");


    //_mm_permutevar_ps
    __m128  fa = _mm_setr_ps(1,2,3,4);
    __m128i fs = _mm_set_epi32(0,0,0,0x02);
    __m128  fc;

    printf("fa: ");
    for(int i=0; i<sizeof(fa)/sizeof(fa.m128_f32[0]); i++)
        printf("%5.1f  ", fa.m128_f32[i]);
    printf("\n");


    fc = _mm_permutevar_ps(fa, fs);

    printf("fc: ");
    for(int i=0; i<sizeof(fc)/sizeof(fc.m128_f32[0]); i++)
        printf("%5.1f  ", fc.m128_f32[i]);
    printf("\n");

    fs = _mm_set_epi32(0,0,0,0x01);
    fc = _mm_permutevar_ps(fa, fs);

    printf("fc: ");
    for(int i=0; i<sizeof(fc)/sizeof(fc.m128_f32[0]); i++)
        printf("%5.1f  ", fc.m128_f32[i]);
    printf("\n");

    return 0;
}
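Note that element access through da.m256_f32[i] relies on MSVC's definition of __m256; GCC and Clang do not expose that member. A portable sketch of the same dump, assuming only the standard intrinsics:

/* Sketch: spill a __m256 to memory and print it, portable across compilers. */
static void print_m256(const char *name, __m256 v)
{
    float tmp[8];
    _mm256_storeu_ps(tmp, v);
    printf("%s: ", name);
    for (int i = 0; i < 8; i++)
        printf("%5.1f  ", tmp[i]);
    printf("\n");
}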
Example #3
inline __m256i Read8(const unsigned char* chunk, int offset) {
    __m256i ret = _mm256_set_epi32(
        ReadLE32(chunk + 0 + offset),
        ReadLE32(chunk + 64 + offset),
        ReadLE32(chunk + 128 + offset),
        ReadLE32(chunk + 192 + offset),
        ReadLE32(chunk + 256 + offset),
        ReadLE32(chunk + 320 + offset),
        ReadLE32(chunk + 384 + offset),
        ReadLE32(chunk + 448 + offset)
    );
    return _mm256_shuffle_epi8(ret, _mm256_set_epi32(
        0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
        0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
}
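Read8 gathers one 32-bit word from each of eight 64-byte-spaced message chunks, then the shuffle reverses the bytes within every dword (a per-lane bswap32). ReadLE32 is assumed from the surrounding code; a minimal sketch, assuming a little-endian host:

#include <stdint.h>
#include <string.h>

/* Sketch: unaligned 4-byte little-endian load. */
static inline uint32_t ReadLE32(const unsigned char* p) {
    uint32_t v;
    memcpy(&v, p, sizeof(v));  /* host assumed little-endian */
    return v;
}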
// Computes part of the matrix*vector product v = Wu. Computes N=8 results.
// For details see PartialMatrixDotVector64 with N=8.
static void PartialMatrixDotVector8(const int8_t* wi, const double* scales,
                                    const int8_t* u, int num_in, int num_out,
                                    double* v) {
  // Register containing 16-bit ones for horizontal add with 16->32 bit
  // conversion.
  __m256i ones =
      _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
  __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1);
  // Initialize all the results to 0.
  __m256i result0 = _mm256_setzero_si256();
  // Iterate over the input (u), one registerful at a time.
  for (int j = 0; j < num_in;) {
    __m256i inputs =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j));
    // Inputs are processed in groups of kNumInputsPerGroup, replicated
    // kNumInputGroups times.
    for (int ig = 0; ig < kNumInputGroups && j < num_in;
         ++ig, j += kNumInputsPerGroup) {
      // Replicate the low 32 bits (4 inputs) 8 times.
      __m256i rep_input =
          _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs));
      // Rotate the inputs in groups of 4, so the next 4 inputs are ready.
      inputs = _mm256_permutevar8x32_epi32(inputs, shift_id);
      __m256i weights, reps;
      // Mul-add, with horizontal add of the 4 inputs to each of the results.
      MultiplyGroup(rep_input, ones, wi, weights, reps, result0);
    }
  }
  ExtractResults(result0, shift_id, wi, scales, num_out, v);
}
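The shift_id permutation rotates the eight dwords down by one lane, so _mm256_broadcastd_epi32 always sees the next group of four 8-bit inputs in lane 0. An illustrative, standalone sketch of that rotate idiom (not tesseract's actual code):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i v  = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    /* Same value as _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1) above. */
    __m256i id = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
    for (int step = 0; step < 8; step++) {
        /* Lane 0 cycles through 10, 11, ..., 17. */
        printf("lane0 = %d\n", _mm_cvtsi128_si32(_mm256_castsi256_si128(v)));
        v = _mm256_permutevar8x32_epi32(v, id);  /* rotate down one dword */
    }
    return 0;
}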
Example #5
static void
avx2_test (void)
{
    union256i_d u, s1, s2;
    int e[8];
    int i;

    s1.x = _mm256_set_epi32 (1, 2, 3, 4, 10, 20, 30, 90000);

    s2.x = _mm256_set_epi32 (88, 44, 3, 22, 11, 98, 76, -100);

    u.x = _mm256_cmpgt_epi32 (s1.x, s2.x);

    for (i = 0; i < 8; i++)
        e[i] = (s1.a[i] > s2.a[i]) ? -1 : 0;

    if (check_union256i_d (u, e))
        abort ();
}
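_mm256_cmpgt_epi32 writes all-ones (-1) into every lane where the signed comparison holds, which is why the scalar reference uses (s1.a[i] > s2.a[i]) ? -1 : 0. A small sketch of collapsing such a lane mask into an 8-bit integer mask:

/* Sketch: bit i of the result is set iff a[i] > b[i] (signed 32-bit lanes). */
static inline int gt_bitmask_epi32(__m256i a, __m256i b)
{
    __m256i m = _mm256_cmpgt_epi32(a, b);              /* lanes: -1 or 0 */
    return _mm256_movemask_ps(_mm256_castsi256_ps(m)); /* one bit per lane */
}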
Example #6
inline void Write8(unsigned char* out, int offset, __m256i v) {
    v = _mm256_shuffle_epi8(v, _mm256_set_epi32(
        0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
        0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
    WriteLE32(out + 0 + offset, _mm256_extract_epi32(v, 7));
    WriteLE32(out + 32 + offset, _mm256_extract_epi32(v, 6));
    WriteLE32(out + 64 + offset, _mm256_extract_epi32(v, 5));
    WriteLE32(out + 96 + offset, _mm256_extract_epi32(v, 4));
    WriteLE32(out + 128 + offset, _mm256_extract_epi32(v, 3));
    WriteLE32(out + 160 + offset, _mm256_extract_epi32(v, 2));
    WriteLE32(out + 192 + offset, _mm256_extract_epi32(v, 1));
    WriteLE32(out + 224 + offset, _mm256_extract_epi32(v, 0));
}
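Write8 is the mirror of Read8: the same shuffle byte-swaps every dword, and the extracts scatter the lanes to 32-byte-spaced outputs, highest lane first. A minimal sketch of the assumed WriteLE32 helper, under the same little-endian-host assumption as ReadLE32:

/* Sketch: unaligned 4-byte little-endian store. */
static inline void WriteLE32(unsigned char* p, uint32_t v) {
    memcpy(p, &v, sizeof(v));  /* host assumed little-endian */
}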
static void FlowInterExtra_AVX2(
        uint8_t *pdst8, int dst_pitch,
        const uint8_t *prefB8, const uint8_t *prefF8, int ref_pitch,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF, int VPitch,
        int width, int height,
        int time256, int nPel,
        const int16_t *VXFullBB, const int16_t *VXFullFF,
        const int16_t *VYFullBB, const int16_t *VYFullFF) {

    const PixelType *prefB = (const PixelType *)prefB8;
    const PixelType *prefF = (const PixelType *)prefF8;
    PixelType *pdst = (PixelType *)pdst8;

    ref_pitch /= sizeof(PixelType);
    dst_pitch /= sizeof(PixelType);

    int nPelLog = ilog2(nPel);

    const __m256i dwords_time256 = _mm256_set1_epi32(time256);
    const __m256i dwords_256_time256 = _mm256_set1_epi32(256 - time256);
    const __m256i dwords_ref_pitch = _mm256_set1_epi32(ref_pitch);
    const __m256i dwords_hoffsets = _mm256_set_epi32(7 << nPelLog, 6 << nPelLog, 5 << nPelLog, 4 << nPelLog, 3 << nPelLog, 2 << nPelLog, 1 << nPelLog, 0);

    const int pixels_per_iteration = 8;
    const int width_avx2 = width & ~(pixels_per_iteration - 1);

    for (int h = 0; h < height; h++) {
        for (int w = 0; w < width_avx2; w += pixels_per_iteration)
            FlowInterExtra_8px_AVX2(w, pdst, prefB, prefF, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, nPelLog, VXFullBB, VXFullFF, VYFullBB, VYFullFF, dwords_time256, dwords_256_time256, dwords_ref_pitch, dwords_hoffsets);

        if (width_avx2 < width)
            FlowInterExtra_8px_AVX2(width - pixels_per_iteration, pdst, prefB, prefF, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, nPelLog, VXFullBB, VXFullFF, VYFullBB, VYFullFF, dwords_time256, dwords_256_time256, dwords_ref_pitch, dwords_hoffsets);

        pdst += dst_pitch;
        prefB += ref_pitch << nPelLog;
        prefF += ref_pitch << nPelLog;
        VXFullB += VPitch;
        VYFullB += VPitch;
        VXFullF += VPitch;
        VYFullF += VPitch;
        MaskB += VPitch;
        MaskF += VPitch;
        VXFullBB += VPitch;
        VYFullBB += VPitch;
        VXFullFF += VPitch;
        VYFullFF += VPitch;
    }
}
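width_avx2 rounds the width down to a multiple of the 8-pixel vector width; any leftover columns are handled by re-running the kernel on the last 8 pixels of the row, overlapping already-written pixels instead of falling back to scalar code. A minimal sketch of that overlapped-tail pattern, with process8() as a hypothetical stand-in for the real kernel (assumes width >= 8):

void process8(int w);  /* hypothetical 8-pixel kernel */

void row_loop(int width) {
    const int step = 8;
    const int width_v = width & ~(step - 1);  /* round down to multiple of 8 */
    for (int w = 0; w < width_v; w += step)
        process8(w);
    if (width_v < width)
        process8(width - step);               /* redo the last 8, overlapping */
}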
Example #8
// Compare rank with all values currently in the queue.  Returns -1 if the value already exists
// or is larger than all values.
// Otherwise, returns the index of the register in which the value should be inserted.
// Mask is replicated to both lanes, so it can be used for both value and rank lane.
int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask)
{
    static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    __m256i eq, eq4;
    int reg, mask;

    // Because items are sorted in ascending order within each (double) register, the mask after GT
    // comparison must be of the form 000...1111, which is one less than a power of two.
    {
        __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20);		// [0 .. 7]
        gtmask = _mm256_cmpgt_epi32(r0_7, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_cmpeq_epi32(r0_7, mrank);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 1;
    }

    if (!mask) {
        __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20);	// [8 .. 15]
        gtmask = _mm256_cmpgt_epi32(r8_15, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank));
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 3;
    }

    if (!mask) {
        gtmask = _mm256_cmpgt_epi32(_rv[4], mrank);							// [16 .. 19]; don't care about value
        eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4])); // .. ditto
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF;       // ignore comparison with values
        eq = _mm256_or_si256(eq, eq4);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 4;
    }

    if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0)
        mask = 0;
    if (!mask)
        return -1;

    // Adjust register according to mask (higher 128 bits in a double register: one register lower).
    // There is no "previous" register to test against for equality if we need to insert in the
    // very first register.  Also duplicate the same mask to both lanes.

    if (mask > 0xF) {
        mask >>= 4;
        --reg;
        gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11);           // replicate high lane to both
    }
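The invariant asserted throughout, ((mask + 1) & mask) == 0, holds exactly for masks of the form 000...0111...1: adding 1 to such a mask yields a power of two, which shares no bits with the mask. A quick sketch:

#include <assert.h>

/* Sketch: 00001111 passes the check, 00001101 does not. */
void mask_form_examples(void) {
    assert(((0x0F + 1) & 0x0F) == 0);  /* contiguous low bits */
    assert(((0x0D + 1) & 0x0D) != 0);  /* hole in the bits    */
}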
Example #9
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_avx2(uint8_t* const dest, const uint8_t* const src,
  const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m256i ymm0[4], ymm1[4];

  /* Create the shuffle mask.
     NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
     most to least significant (i.e., their order is reversed when compared to
     loading the mask from an array). */
  const __m256i mask = _mm256_set_epi32(
    0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);

  for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) {
    /* Fetch 32 elements (128 bytes) then transpose bytes and words. */
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i))));
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8);
      ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d);
      ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]);
      ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e);
      ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]);
    }
    /* Transpose double words */
    for (j = 0; j < 2; j++) {
      ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]);
      ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]);
    }
    /* Transpose quad words */
    for (j = 0; j < 2; j++) {
      ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]);
      ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]);
    }
    for (j = 0; j < 4; j++) {
      ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask);
    }
    /* Store the result vectors */
    uint8_t* const dest_for_ith_element = dest + i;
    for (j = 0; j < 4; j++) {
      _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]);
    }
  }
}
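As the NOTE in shuffle4_avx2 says, the set intrinsics take their arguments from the most significant element down, so the mask above is the permutation {0, 4, 1, 5, 2, 6, 3, 7} in memory order. A quick sketch of the equivalence:

/* Sketch: the same mask written both ways; the two values are identical. */
const __m256i mask_set  = _mm256_set_epi32 (7, 3, 6, 2, 5, 1, 4, 0);
const __m256i mask_setr = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);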
Example #10
static inline void do_encode_12bytes(const char (*alphabet)[2], char *out, __m256i chunk)
{
    const __m256i shufflemask = _mm256_set_epi8(
        -1, 9, 10, 11,
        -1, 9, 10, 11,
        -1, 6, 7, 8,
        -1, 6, 7, 8,
        -1, 3, 4, 5,
        -1, 3, 4, 5,
        -1, 0, 1, 2,
        -1, 0, 1, 2
    );
    const __m256i shifts = _mm256_set_epi32(0, 12, 0, 12, 0, 12, 0, 12);
    const __m256i masks = _mm256_set1_epi32(4095);

    // convert from big endian and rearrange the bytes
    chunk = _mm256_shuffle_epi8(chunk, shufflemask);
    chunk = _mm256_srlv_epi32(chunk, shifts);
    chunk = _mm256_and_si256(chunk, masks);

    // write the two halves to memory
    do_encode_6bytes(alphabet, out + 0, _mm256_extracti128_si256(chunk, 0));
    do_encode_6bytes(alphabet, out + 8, _mm256_extracti128_si256(chunk, 1));
}
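After the shuffle, every 32-bit lane holds three source bytes in big-endian order below a zero pad; the variable shift (12 or 0 per lane) then selects the high or low 12-bit group and the AND keeps exactly 12 bits. A scalar sketch of what one lane pair computes, with b0..b2 as hypothetical input bytes:

/* Sketch: split 24 bits into two 12-bit indices. */
uint32_t three = ((uint32_t)b0 << 16) | ((uint32_t)b1 << 8) | b2;
uint32_t hi = (three >> 12) & 4095;  /* lane whose shift is 12 */
uint32_t lo =  three        & 4095;  /* lane whose shift is 0  */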
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m256d v_tmp;
    __m256d v00, v01, v02, v03;
    __m256d v10, v11, v12, v13;
    __m256d v20, v21, v22, v23;
    __m256d v30, v31, v32, v33;
    __m256i v_cur_roots;
    __m256 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // n is used in subtractions that can go negative, so it must be a signed int

    int idx1, idx1_root;
    int idx2;
    int idx3, idx3_root;
    int pad_root, pad, pad_r;
    
    idx1      = ((int) mem->e_sz) - 1;
    idx1_root = ((int) mem->r_sz);
    // the convention is that after iteration i, idx1 points to the first element of line i+1
    e[idx1++] = q[n];
    
    // pad contains the padding for row i+1
    // for row n it's always 3
    pad = 3;
    pad_root = 7;
    for (i = n-1; i >= 0; --i) {
        idx1      -= 2*(n-i)+1 + pad;
        idx1_root -= 2*(n-i)+1 + pad_root;
        idx2       = idx1 + 1;
        e[idx1]    = q[i];
        w[idx1]    = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx2     += pad; // padding of line i+1
        // idx2 now points to the first element of the next line

        idx3      = idx1;
        idx3_root = idx1_root;
        pad_r     = pad;
        for (r = i; r < n; ++r) {
            pad_r     = (pad_r+1)&3; // padding of line r+1
            idx1      = idx3;
            idx1_root = idx3_root;
            l_end     = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp     = e[idx1++];
            idx1_root++;
            // calculate until a multiple of 16 doubles is left
            // 16 = 4 vectors * 4 doubles per 256-bit vector
            l_end_pre = idx2 + ((n-r)&15);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1_root] = r;
                }
                idx1++;
                idx1_root++;
            }
            
            v_tmp = _mm256_set1_pd( e_tmp );
            // process 16 doubles per iteration: 4 vectors of 4 doubles each
            v_cur_roots = _mm256_set1_epi32(r);
            for( ; idx2 < l_end; idx2 += 16 ) {
                v01 = _mm256_load_pd( &w[idx1   ] );
                v11 = _mm256_load_pd( &w[idx1+ 4] );
                v21 = _mm256_load_pd( &w[idx1+ 8] );
                v31 = _mm256_load_pd( &w[idx1+12] );

                v00 = _mm256_load_pd( &e[idx2   ] );
                v01 = _mm256_add_pd( v01, v_tmp ); 
                v10 = _mm256_load_pd( &e[idx2+ 4] );
                v11 = _mm256_add_pd( v11, v_tmp );
                v20 = _mm256_load_pd( &e[idx2+ 8] );
                v21 = _mm256_add_pd( v21, v_tmp );
                v30 = _mm256_load_pd( &e[idx2+12] );
                v31 = _mm256_add_pd( v31, v_tmp );

                v01 = _mm256_add_pd( v01, v00 );
                v03 = _mm256_load_pd( &e[idx1   ] );
                v11 = _mm256_add_pd( v11, v10 );
                v13 = _mm256_load_pd( &e[idx1+ 4] );
                v21 = _mm256_add_pd( v21, v20 );
                v23 = _mm256_load_pd( &e[idx1+ 8] );
                v31 = _mm256_add_pd( v31, v30 );
                v33 = _mm256_load_pd( &e[idx1+12] );

                v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ );
                v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ );
                v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ );
                v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ );

                _mm256_maskstore_pd( &e[idx1   ],
                        _mm256_castpd_si256( v02 ), v01 );
                _mm256_maskstore_pd( &e[idx1+ 4],
                        _mm256_castpd_si256( v12 ), v11 );

                v_rootmask0 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v02)),
                            _mm256_cvtpd_ps(v12) , 1
                    );

                _mm256_maskstore_pd( &e[idx1+ 8],
                        _mm256_castpd_si256( v22 ), v21 );
                _mm256_maskstore_pd( &e[idx1+12], 
                        _mm256_castpd_si256( v32 ), v31 );
                v_rootmask1 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v22)),
                            _mm256_cvtpd_ps(v32) , 1
                    );
                
                _mm256_maskstore_ps( (float*)&root[idx1_root    ],
                        _mm256_castps_si256( v_rootmask0 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                _mm256_maskstore_ps( (float*)&root[idx1_root + 8],
                        _mm256_castps_si256( v_rootmask1 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                idx1      += 16;
                idx1_root += 16;
            }
            idx2 += pad_r;
            idx3++;
            idx3_root++;
        }
        pad      = (pad     -1)&3;
        pad_root = (pad_root-1)&7;
    }
    // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding
    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[ ((n/4)+1)*4 - 1 ];
}
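For example, with n = 6 the first row holds n+1 = 7 entries, padded up to 8, so the returned index is ((6/4)+1)*4 - 1 = 7.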
    parasail_result_t *result = parasail_result_new();
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int32_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi32(NEG_INF);
    __m256i vOpen = _mm256_set1_epi32(open);
    __m256i vGap  = _mm256_set1_epi32(gap);
    __m256i vOne = _mm256_set1_epi32(1);
    __m256i vN = _mm256_set1_epi32(N);
    __m256i vGapN = _mm256_set1_epi32(gap*N);
    __m256i vNegOne = _mm256_set1_epi32(-1);
    __m256i vI = _mm256_set_epi32(0,1,2,3,4,5,6,7);
    __m256i vJreset = _mm256_set_epi32(0,-1,-2,-3,-4,-5,-6,-7);
    __m256i vMax = vNegInf;
    __m256i vILimit = _mm256_set1_epi32(s1Len);
    __m256i vILimit1 = _mm256_sub_epi32(vILimit, vOne);
    __m256i vJLimit = _mm256_set1_epi32(s2Len);
    __m256i vJLimit1 = _mm256_sub_epi32(vJLimit, vOne);
    __m256i vIBoundary = _mm256_set_epi32(
            -open-0*gap,
            -open-1*gap,
            -open-2*gap,
            -open-3*gap,
            -open-4*gap,
            -open-5*gap,
            -open-6*gap,
            -open-7*gap
Example #13
__m256i
foo (int x1, int x2, int x3, int x4,
     int x5, int x6, int x7, int x8)
{
    return _mm256_set_epi32 (x1, x2, x3, x4, x5, x6, x7, x8);
}
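The first argument of _mm256_set_epi32 becomes the most significant element, so the memory order of foo's result is the reverse of the argument order. A sketch of what a caller observes:

/* Sketch: store the result and inspect it in memory order. */
int out[8];
_mm256_storeu_si256((__m256i*)out, foo(1, 2, 3, 4, 5, 6, 7, 8));
/* out == {8, 7, 6, 5, 4, 3, 2, 1} */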
Example #14
 INLINE avxi( StepTy   ) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int32_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi32(NEG_INF);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 4); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi32(open);
    __m256i vGap  = _mm256_set1_epi32(gap);
    __m256i vZero = _mm256_set1_epi32(0);
    __m256i vOne = _mm256_set1_epi32(1);
    __m256i vN = _mm256_set1_epi32(N);
    __m256i vNegOne = _mm256_set1_epi32(-1);
    __m256i vI = _mm256_set_epi32(0,1,2,3,4,5,6,7);
    __m256i vJreset = _mm256_set_epi32(0,-1,-2,-3,-4,-5,-6,-7);
    __m256i vMax = vNegInf;
    __m256i vEndI = vNegInf;
    __m256i vEndJ = vNegInf;
    __m256i vILimit = _mm256_set1_epi32(s1Len);
    __m256i vJLimit = _mm256_set1_epi32(s2Len);
    

    /* convert _s1 from char to int in range 0-23 */
    for (i=0; i<s1Len; ++i) {
        s1[i] = matrix->mapper[(unsigned char)_s1[i]];
    }
    /* pad back of s1 with dummy values */
    for (i=s1Len; i<s1Len_PAD; ++i) {
        s1[i] = 0; /* point to first matrix row because we don't care */
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
	int maxIter          = maxIteration;
	float mu             = 5.5f; 
	float tableau[12]    = { 0.0f };

    if ((mBlocklength == 576) && (mNChecks == 288))
    {
        mu          = 3.37309f; // penalty
        tableau[2]  = 0.00001f;
        tableau[3]  = 2.00928f;
        tableau[6]  = 4.69438f;

    }
    else if((mBlocklength == 2304) && (mNChecks == 1152) )
    {
        mu          = 3.81398683f; // penalty
        tableau[2]  = 0.29669288f;
        tableau[3]  = 0.46964023f;
        tableau[6]  = 3.19548154f;
    }
    else
    {
        mu          = 5.5f; // penalty
        tableau[2]  = 0.8f;
        tableau[3]  = 0.8f;
        tableau[6]  = 0.8f;
    }

    const float rho      = 1.9f;     // over-relaxation parameter
    const float un_m_rho = 1.0f - rho;
    const auto  _rho      = _mm256_set1_ps(      rho );
    const auto  _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];

    //
    // PRECOMPUTE THE CONSTANTS
    //
	#pragma  unroll
    for (int i = 0; i < 7; i++)
    {
        tableaX[i] = tableau[ i ] / mu;
    }
	const auto t_mu    = _mm256_set1_ps ( mu );

	const auto t2_amu  = _mm256_set1_ps (        tableau[ 2 ] / mu   );
	const auto t3_amu  = _mm256_set1_ps (        tableau[ 3 ] / mu   );
	const auto t6_amu  = _mm256_set1_ps (        tableau[ 6 ] / mu   );

	const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
	const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
	const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );

	const auto t2_deg  = _mm256_set1_ps ( 2.0f );
	const auto t3_deg  = _mm256_set1_ps ( 3.0f );
	const auto t6_deg  = _mm256_set1_ps ( 6.0f );

	const auto zero    = _mm256_set1_ps ( 0.0f );
	const auto un      = _mm256_set1_ps ( 1.0f );
    const __m256 a     = _mm256_set1_ps ( 0.0f );
    const __m256 b     = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
	#pragma  unroll
	for( int j = 0; j < _mPCheckMapSize; j+=8 )
    {
	_mm256_store_ps(&Lambda  [j],         a);
        _mm256_store_ps(&zReplica[j],         b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

	for(int i = 0; i < maxIter; i++)
	{
        int ptr    = 0;
		mIteration = i + 1;

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto start = timer();
		#endif

        //
		// VN processing kernel
		//
		#pragma  unroll
		for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
            	const int dVN = 2;
            	for(int qq = 0; qq < 8; qq++) 
		{
    				M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    				#pragma  unroll
    				for(int k = 1; k < dVN; k++) 
				{
    					M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    				}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
            	const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
            	const int dVN = 3;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
		}else if( degVn == 6 ){
#if 1
            	const int dVN = 6;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }
        }

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				t_vn   += (timer() - start);
		#endif

		//
		// CN processing kernel
		//
	int CumSumCheckDegree = 0; // cumulative position of the current edge in the factor graph
        int allVerified       = 0;
	float vector_before_proj[8] __attribute__((aligned(64)));

        const auto zero    = _mm256_set1_ps ( 0.0f    );
        const auto mask_6  = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7  = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto  dot5   = _mm256_set1_ps(     0.5f );

    	//
    	// MEASURE OF THE CN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto starT = timer();
		#endif

    	const auto seuilProj = _mm256_set1_ps( 1e-5f );
        for(int j = 0; j < _mNChecks; j++)
		{
            if( CheckDegree[j] == 6 ){
            	const int  cDeg6       = 0x3F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		int test              = (_mm256_movemask_ps( synd ) & cDeg6);  // deg 6
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  (xpred,      _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                allVerified       += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

    	        if( skip == false )
    	        {
    	        	const auto _ztemp  = mp.projection_deg6( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
	    	    	const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
	    	    	const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda[CumSumCheckDegree],         mask_6,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica[CumSumCheckDegree],         mask_6,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
					t_pj   += (timer() - START);
		#endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 )
	    {
            	const int  cDeg7       = 0x7F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		const int  test       = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  ( xpred,    _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                allVerified         += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

    	        if( skip == false )
    	        {
			const auto _ztemp  = mp.projection_deg7( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
    	    		const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
    	    		const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda        [CumSumCheckDegree], mask_7,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica        [CumSumCheckDegree], mask_7,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
							t_pj   += (timer() - START);
		#endif

                CumSumCheckDegree += 7;

            }else{
                exit( 0 );
            }
        }

    	//
    	// MEASURE OF THE CN LOOP EXECUTION TIME
    	//
        #ifdef PROFILE_ON
				t_cn   += (timer() - starT);
	#endif
	#ifdef PROFILE_ON
	t_ex += 1;
		//FILE *ft=fopen("time.txt","a");
		//fprintf(ft,"%d \n", t_cn/t_ex);
		//fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
		//fclose(ft);
	#endif
		if(allVerified == 0)
		{
			mAlgorithmConverge = true;
			mValidCodeword     = true;
			break;
		}
	}

	//
	// MEASURE OF THE NUMBER OF EXECUTION
	//
//	#ifdef PROFILE_ON
//		t_ex += 1;
//	#endif

}
Example #17
   _mm_set_epi32(0x00200000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x00400000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x00800000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x01000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x02000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x04000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x08000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x10000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x20000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x40000000, 0x00000000, 0x00000000, 0x00000000),
   _mm_set_epi32(0x80000000, 0x00000000, 0x00000000, 0x00000000)
};

#ifdef AVX2
__m256i sseMasks256[256] = {
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000002),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000004),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000008),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000010),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000020),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000040),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000080),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000400),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000800),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00001000),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00002000),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00004000),
   _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00008000),
    void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item)
    {
        nn_workload_data_t *input_view = work_item->input[0]->output;
        const auto &arguments = work_item->arguments.forward_softmax_fixedpoint;

        const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];
        const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1;

        const auto num_full_blocks = output_width / C_data_stride;
        const auto partial_block_size = (output_width / C_simd_width) % C_max_acc;
        const auto subsimd_block_size = output_width % C_simd_width;

        const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x];

        const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p];

        const auto out_fraction = arguments.input_fraction;

        float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64);

        auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start];

        auto shift = out_fraction;
        if (shift > 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) / (1 << shift);
        }
        else if (shift < 0)
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]) * (1 << -shift);
        }
        else
        {
            for (uint32_t i = 0; i < input_width; i++)
                input_f[i] = (float)(input_buffer[i]);
        }

        __m256 acc_sum = _mm256_setzero_ps();
        float subsimd_sum = 0.0f;
        {
            auto input_buffer = input_f;
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break;
            case  2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break;
            case  3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break;
            case  4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break;
            case  5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break;
            case  6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break;
            case  7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break;
            case  8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break;
            case  9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break;
            case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break;
            case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break;
            case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break;
            case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break;
            case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break;
            case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break;
            case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break;
            case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break;
            case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break;
            case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break;
            case 7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }

        {
            __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum);
            intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);
            intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum);

            acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum));
            subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0));

            acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum);
            subsimd_sum = 1.0f / subsimd_sum;
        }

        {
            auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start];

            for (auto block = 0u; block < num_full_blocks; ++block)
            {
                // Run computation.
                softmax_finalize_block<C_max_acc>(output_buffer, acc_sum);
            }

            switch (partial_block_size)
            {
            case  0: break;
            case  1: softmax_finalize_block< 1>(output_buffer, acc_sum); break;
            case  2: softmax_finalize_block< 2>(output_buffer, acc_sum); break;
            case  3: softmax_finalize_block< 3>(output_buffer, acc_sum); break;
            case  4: softmax_finalize_block< 4>(output_buffer, acc_sum); break;
            case  5: softmax_finalize_block< 5>(output_buffer, acc_sum); break;
            case  6: softmax_finalize_block< 6>(output_buffer, acc_sum); break;
            case  7: softmax_finalize_block< 7>(output_buffer, acc_sum); break;
            case  8: softmax_finalize_block< 8>(output_buffer, acc_sum); break;
            case  9: softmax_finalize_block< 9>(output_buffer, acc_sum); break;
            case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break;
            case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break;
            case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break;
            case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break;
            case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break;
            default: NN_UNREACHABLE_CODE;
            }

            switch (subsimd_block_size)
            {
            case 0: break;
            case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break;
            case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break;
            case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break;
            case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break;
            case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break;
            case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break;
            case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break;
            default: NN_UNREACHABLE_CODE;
            }
        }
        _mm_free(input_f);
    }
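The reduction above folds the eight partial sums with two rounds of hadd and a cross-lane permute before broadcasting the reciprocal. An alternative minimal sketch of an 8-float horizontal sum (just the idea, not this kernel's code):

#include <immintrin.h>

/* Sketch: sum the 8 floats of v into a scalar. */
static inline float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);          /* lanes 0..3 */
    __m128 hi = _mm256_extractf128_ps(v, 1);        /* lanes 4..7 */
    __m128 s  = _mm_add_ps(lo, hi);                 /* 4 partial sums */
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));         /* 2 partial sums */
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));  /* final sum in lane 0 */
    return _mm_cvtss_f32(s);
}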
Example #19
 INLINE avxi( int a, int b, int c, int d, int e, int f, int g, int h ) : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}
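The reversed argument order makes this constructor behave like _mm256_setr_epi32: element a lands in the lowest lane. In general, _mm256_set_epi32(h, g, f, e, d, c, b, a) produces the same value as _mm256_setr_epi32(a, b, c, d, e, f, g, h).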