static void
avx2_test (void)
{
  union256i_d s1, s2;
  union256i_w u;
  short e[16];
  int i;

  s1.x = _mm256_set_epi32 (1, 2, 3, 4, 65000, 20, 30, 90);
  s2.x = _mm256_set_epi32 (88, 44, 33, 22, 11, 98, 76, -65000);

  u.x = _mm256_packs_epi32 (s1.x, s2.x);

  for (i = 0; i < 4; i++)
    {
      e[i] = int_to_short (s1.a[i]);
      e[i + 4] = int_to_short (s2.a[i]);
      e[i + 8] = int_to_short (s1.a[i + 4]);
      e[i + 12] = int_to_short (s2.a[i + 4]);
    }

  if (check_union256i_w (u, e))
    abort ();
}
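/* Standalone sketch (a hypothetical demo, not part of the test above): _mm256_packs_epi32
   saturates each 32-bit input to 16 bits and interleaves the two sources per 128-bit lane,
   which is why the expected values above follow the s1-low, s2-low, s1-high, s2-high pattern. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    __m256i a = _mm256_setr_epi32(1, 2, 3, 65000, 5, 6, 7, 8);
    __m256i b = _mm256_setr_epi32(-65000, 20, 30, 40, 50, 60, 70, 80);
    __m256i p = _mm256_packs_epi32(a, b);
    short out[16];
    _mm256_storeu_si256((__m256i *)out, p);
    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]);
    /* prints: 1 2 3 32767 -32768 20 30 40 5 6 7 8 50 60 70 80 */
    printf("\n");
    return 0;
}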
/* _mm256_permutevar_ps / _mm_permutevar_ps demo.
   Note: the .m256_f32 / .m128_f32 members are MSVC-specific. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    //_mm256_permutevar_ps
    __m256  da = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    __m256i ds = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0x02);
    __m256  dc;

    printf("da: ");
    for (int i = 0; i < sizeof(da) / sizeof(da.m256_f32[0]); i++)
        printf("%5.1f ", da.m256_f32[i]);
    printf("\n");

    dc = _mm256_permutevar_ps(da, ds);
    printf("dc: ");
    for (int i = 0; i < sizeof(dc) / sizeof(dc.m256_f32[0]); i++)
        printf("%5.1f ", dc.m256_f32[i]);
    printf("\n");

    ds = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0x01);
    dc = _mm256_permutevar_ps(da, ds);
    printf("dc: ");
    for (int i = 0; i < sizeof(dc) / sizeof(dc.m256_f32[0]); i++)
        printf("%5.1f ", dc.m256_f32[i]);
    printf("\n\n");

    //_mm_permutevar_ps
    __m128  fa = _mm_setr_ps(1, 2, 3, 4);
    __m128i fs = _mm_set_epi32(0, 0, 0, 0x02);
    __m128  fc;

    printf("fa: ");
    for (int i = 0; i < sizeof(fa) / sizeof(fa.m128_f32[0]); i++)
        printf("%5.1f ", fa.m128_f32[i]);
    printf("\n");

    fc = _mm_permutevar_ps(fa, fs);
    printf("fc: ");
    for (int i = 0; i < sizeof(fc) / sizeof(fc.m128_f32[0]); i++)
        printf("%5.1f ", fc.m128_f32[i]);
    printf("\n");

    fs = _mm_set_epi32(0, 0, 0, 0x01);
    fc = _mm_permutevar_ps(fa, fs);
    printf("fc: ");
    for (int i = 0; i < sizeof(fc) / sizeof(fc.m128_f32[0]); i++)
        printf("%5.1f ", fc.m128_f32[i]);
    printf("\n");

    return 0;
}
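/* Minimal sketch to complement the demo above (hypothetical example, not from the original
   program): _mm256_permutevar_ps uses the low two bits of each 32-bit control element to pick
   a float within that element's own 128-bit lane, so a 3,2,1,0 control pattern reverses each
   half independently. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    __m256  a   = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    __m256i idx = _mm256_setr_epi32(3, 2, 1, 0, 3, 2, 1, 0);   /* reverse within each lane */
    __m256  r   = _mm256_permutevar_ps(a, idx);
    float out[8];
    _mm256_storeu_ps(out, r);
    for (int i = 0; i < 8; i++)
        printf("%5.1f ", out[i]);                              /* 4 3 2 1 8 7 6 5 */
    printf("\n");
    return 0;
}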
__m256i inline Read8(const unsigned char* chunk, int offset)
{
    __m256i ret = _mm256_set_epi32(
        ReadLE32(chunk + 0 + offset),
        ReadLE32(chunk + 64 + offset),
        ReadLE32(chunk + 128 + offset),
        ReadLE32(chunk + 192 + offset),
        ReadLE32(chunk + 256 + offset),
        ReadLE32(chunk + 320 + offset),
        ReadLE32(chunk + 384 + offset),
        ReadLE32(chunk + 448 + offset)
    );
    return _mm256_shuffle_epi8(ret, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
                                                     0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
}
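/* Minimal sketch (hypothetical demo, not part of Read8): the shuffle mask used above reverses
   the bytes of every 32-bit word, i.e. it converts each dword between little and big endian
   within its 128-bit lane. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    __m256i v = _mm256_set1_epi32(0x01020304);
    __m256i bswap_mask = _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
                                          0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL);
    __m256i r = _mm256_shuffle_epi8(v, bswap_mask);
    printf("0x%08X\n", (unsigned)_mm256_extract_epi32(r, 0));  /* 0x04030201 */
    return 0;
}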
// Computes part of matrix.vector v = Wu. Computes N=8 results. // For details see PartialMatrixDotVector64 with N=8. static void PartialMatrixDotVector8(const int8_t* wi, const double* scales, const int8_t* u, int num_in, int num_out, double* v) { // Register containing 16-bit ones for horizontal add with 16->32 bit // conversion. __m256i ones = _mm256_set_epi16(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); __m256i shift_id = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1); // Initialize all the results to 0. __m256i result0 = _mm256_setzero_si256(); // Iterate over the input (u), one registerful at a time. for (int j = 0; j < num_in;) { __m256i inputs = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(u + j)); // Inputs are processed in groups of kNumInputsPerGroup, replicated // kNumInputGroups times. for (int ig = 0; ig < kNumInputGroups && j < num_in; ++ig, j += kNumInputsPerGroup) { // Replicate the low 32 bits (4 inputs) 8 times. __m256i rep_input = _mm256_broadcastd_epi32(_mm256_castsi256_si128(inputs)); // Rotate the inputs in groups of 4, so the next 4 inputs are ready. inputs = _mm256_permutevar8x32_epi32(inputs, shift_id); __m256i weights, reps; // Mul-add, with horizontal add of the 4 inputs to each of the results. MultiplyGroup(rep_input, ones, wi, weights, reps, result0); } } ExtractResults(result0, shift_id, wi, scales, num_out, v); }
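/* Minimal sketch of the idiom the "ones" register above enables (hypothetical demo, separate
   from PartialMatrixDotVector8): _mm256_madd_epi16(v, ones) multiplies by 1 and adds adjacent
   16-bit lanes, giving the 16-to-32-bit horizontal add mentioned in the comment. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    __m256i ones = _mm256_set1_epi16(1);
    __m256i v    = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8,
                                     9, 10, 11, 12, 13, 14, 15, 16);
    __m256i sums = _mm256_madd_epi16(v, ones);     /* {3, 7, 11, 15, 19, 23, 27, 31} */
    int out[8];
    _mm256_storeu_si256((__m256i *)out, sums);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}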
static void
avx2_test (void)
{
  union256i_d u, s1, s2;
  int e[8];
  int i;

  s1.x = _mm256_set_epi32 (1, 2, 3, 4, 10, 20, 30, 90000);
  s2.x = _mm256_set_epi32 (88, 44, 3, 22, 11, 98, 76, -100);

  u.x = _mm256_cmpgt_epi32 (s1.x, s2.x);

  for (i = 0; i < 8; i++)
    e[i] = (s1.a[i] > s2.a[i]) ? -1 : 0;

  if (check_union256i_d (u, e))
    abort ();
}
void inline Write8(unsigned char* out, int offset, __m256i v)
{
    v = _mm256_shuffle_epi8(v, _mm256_set_epi32(0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL,
                                                0x0C0D0E0FUL, 0x08090A0BUL, 0x04050607UL, 0x00010203UL));
    WriteLE32(out + 0 + offset, _mm256_extract_epi32(v, 7));
    WriteLE32(out + 32 + offset, _mm256_extract_epi32(v, 6));
    WriteLE32(out + 64 + offset, _mm256_extract_epi32(v, 5));
    WriteLE32(out + 96 + offset, _mm256_extract_epi32(v, 4));
    WriteLE32(out + 128 + offset, _mm256_extract_epi32(v, 3));
    WriteLE32(out + 160 + offset, _mm256_extract_epi32(v, 2));
    WriteLE32(out + 192 + offset, _mm256_extract_epi32(v, 1));
    WriteLE32(out + 224 + offset, _mm256_extract_epi32(v, 0));
}
static void FlowInterExtra_AVX2( uint8_t *pdst8, int dst_pitch, const uint8_t *prefB8, const uint8_t *prefF8, int ref_pitch, const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF, const uint8_t *MaskB, const uint8_t *MaskF, int VPitch, int width, int height, int time256, int nPel, const int16_t *VXFullBB, const int16_t *VXFullFF, const int16_t *VYFullBB, const int16_t *VYFullFF) { const PixelType *prefB = (const PixelType *)prefB8; const PixelType *prefF = (const PixelType *)prefF8; PixelType *pdst = (PixelType *)pdst8; ref_pitch /= sizeof(PixelType); dst_pitch /= sizeof(PixelType); int nPelLog = ilog2(nPel); const __m256i dwords_time256 = _mm256_set1_epi32(time256); const __m256i dwords_256_time256 = _mm256_set1_epi32(256 - time256); const __m256i dwords_ref_pitch = _mm256_set1_epi32(ref_pitch); const __m256i dwords_hoffsets = _mm256_set_epi32(7 << nPelLog, 6 << nPelLog, 5 << nPelLog, 4 << nPelLog, 3 << nPelLog, 2 << nPelLog, 1 << nPelLog, 0); const int pixels_per_iteration = 8; const int width_avx2 = width & ~(pixels_per_iteration - 1); for (int h = 0; h < height; h++) { for (int w = 0; w < width_avx2; w += pixels_per_iteration) FlowInterExtra_8px_AVX2(w, pdst, prefB, prefF, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, nPelLog, VXFullBB, VXFullFF, VYFullBB, VYFullFF, dwords_time256, dwords_256_time256, dwords_ref_pitch, dwords_hoffsets); if (width_avx2 < width) FlowInterExtra_8px_AVX2(width - pixels_per_iteration, pdst, prefB, prefF, VXFullB, VXFullF, VYFullB, VYFullF, MaskB, MaskF, nPelLog, VXFullBB, VXFullFF, VYFullBB, VYFullFF, dwords_time256, dwords_256_time256, dwords_ref_pitch, dwords_hoffsets); pdst += dst_pitch; prefB += ref_pitch << nPelLog; prefF += ref_pitch << nPelLog; VXFullB += VPitch; VYFullB += VPitch; VXFullF += VPitch; VYFullF += VPitch; MaskB += VPitch; MaskF += VPitch; VXFullBB += VPitch; VYFullBB += VPitch; VXFullFF += VPitch; VYFullFF += VPitch; } }
// Compare rank with all values currently in the queue. Returns -1 if the value already exists
// or is larger than all values.
// Otherwise, returns the index of the register in which the value should be inserted.
// Mask is replicated to both lanes, so it can be used for both the value and the rank lane.
int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask)
{
    static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    __m256i eq, eq4;
    int reg, mask;

    // Because items are sorted in ascending order within each (double) register, the mask after GT
    // comparison must be of the form 000...1111, which is one less than a power of two.
    {
        __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20);     // [0 .. 7]
        gtmask = _mm256_cmpgt_epi32(r0_7, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_cmpeq_epi32(r0_7, mrank);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 1;
    }
    if (!mask) {
        __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20);    // [8 .. 15]
        gtmask = _mm256_cmpgt_epi32(r8_15, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank));
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 3;
    }
    if (!mask) {
        gtmask = _mm256_cmpgt_epi32(_rv[4], mrank);                          // [16 .. 19]; don't care about value
        eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4]));  // .. ditto
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF;        // ignore comparison with values
        eq = _mm256_or_si256(eq, eq4);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 4;
    }
    if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0)
        mask = 0;
    if (!mask)
        return -1;

    // Adjust register according to mask (higher 128 bits in a double register: one register lower).
    // There is no "previous" register to test against for equality if we need to insert in the
    // very first register. Also duplicate the same mask to both lanes.
    if (mask > 0xF) {
        mask >>= 4;
        --reg;
        gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11);   // replicate high lane to both
    }
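/* Quick sanity sketch of the assertion used above (hypothetical, scalar-only): a mask of the
   form 000...1111 is one less than a power of two, so adding 1 clears every set bit and
   ((mask + 1) & mask) == 0 holds; any non-contiguous mask fails the test. */
#include <assert.h>

int main(void)
{
    int contiguous[] = {0x0, 0x1, 0x3, 0x7, 0xF, 0xFF};
    for (int i = 0; i < 6; i++)
        assert(((contiguous[i] + 1) & contiguous[i]) == 0);
    assert(((0x5 + 1) & 0x5) != 0);   /* 0101 is not of the form 000...1111 */
    return 0;
}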
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */ static void shuffle4_avx2(uint8_t* const dest, const uint8_t* const src, const size_t vectorizable_elements, const size_t total_elements) { static const size_t bytesoftype = 4; size_t i; int j; __m256i ymm0[4], ymm1[4]; /* Create the shuffle mask. NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from most to least significant (i.e., their order is reversed when compared to loading the mask from an array). */ const __m256i mask = _mm256_set_epi32( 0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); for (i = 0; i < vectorizable_elements; i += sizeof(__m256i)) { /* Fetch 32 elements (128 bytes) then transpose bytes and words. */ for (j = 0; j < 4; j++) { ymm0[j] = _mm256_loadu_si256((__m256i*)(src + (i * bytesoftype) + (j * sizeof(__m256i)))); ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0xd8); ymm0[j] = _mm256_shuffle_epi32(ymm0[j], 0x8d); ymm0[j] = _mm256_unpacklo_epi8(ymm1[j], ymm0[j]); ymm1[j] = _mm256_shuffle_epi32(ymm0[j], 0x04e); ymm0[j] = _mm256_unpacklo_epi16(ymm0[j], ymm1[j]); } /* Transpose double words */ for (j = 0; j < 2; j++) { ymm1[j*2] = _mm256_unpacklo_epi32(ymm0[j*2], ymm0[j*2+1]); ymm1[j*2+1] = _mm256_unpackhi_epi32(ymm0[j*2], ymm0[j*2+1]); } /* Transpose quad words */ for (j = 0; j < 2; j++) { ymm0[j*2] = _mm256_unpacklo_epi64(ymm1[j], ymm1[j+2]); ymm0[j*2+1] = _mm256_unpackhi_epi64(ymm1[j], ymm1[j+2]); } for (j = 0; j < 4; j++) { ymm0[j] = _mm256_permutevar8x32_epi32(ymm0[j], mask); } /* Store the result vectors */ uint8_t* const dest_for_ith_element = dest + i; for (j = 0; j < 4; j++) { _mm256_storeu_si256((__m256i*)(dest_for_ith_element + (j * total_elements)), ymm0[j]); } } }
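/* Minimal sketch of the ordering note above (hypothetical demo, not part of shuffle4_avx2):
   _mm256_set_epi32 takes its arguments from most to least significant element, so it matches
   a load from an array only when the argument list is reversed. */
#include <stdio.h>
#include <string.h>
#include <immintrin.h>

int main(void)
{
    const int from_array[8] = {0x00, 0x04, 0x01, 0x05, 0x02, 0x06, 0x03, 0x07};
    __m256i by_set  = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00);
    __m256i by_load = _mm256_loadu_si256((const __m256i *)from_array);
    int a[8], b[8];
    _mm256_storeu_si256((__m256i *)a, by_set);
    _mm256_storeu_si256((__m256i *)b, by_load);
    printf("%s\n", memcmp(a, b, sizeof(a)) == 0 ? "identical" : "different");
    return 0;
}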
static inline void do_encode_12bytes(const char (*alphabet)[2], char *out, __m256i chunk)
{
    const __m256i shufflemask = _mm256_set_epi8(
        -1, 9, 10, 11, -1, 9, 10, 11,
        -1, 6, 7, 8, -1, 6, 7, 8,
        -1, 3, 4, 5, -1, 3, 4, 5,
        -1, 0, 1, 2, -1, 0, 1, 2
    );
    const __m256i shifts = _mm256_set_epi32(0, 12, 0, 12, 0, 12, 0, 12);
    const __m256i masks = _mm256_set1_epi32(4095);

    // convert from big endian and rearrange the bytes
    chunk = _mm256_shuffle_epi8(chunk, shufflemask);
    chunk = _mm256_srlv_epi32(chunk, shifts);
    chunk = _mm256_and_si256(chunk, masks);

    // write the two halves to memory
    do_encode_6bytes(alphabet, out + 0, _mm256_extracti128_si256(chunk, 0));
    do_encode_6bytes(alphabet, out + 8, _mm256_extracti128_si256(chunk, 1));
}
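/* Minimal sketch (hypothetical demo, not part of do_encode_12bytes): _mm256_srlv_epi32 shifts
   each 32-bit element by its own count, so the alternating 12/0 shift vector above splits the
   two 12-bit groups packed in every dword. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    __m256i v      = _mm256_set1_epi32(0x00ABCDEF);
    __m256i shifts = _mm256_setr_epi32(12, 0, 12, 0, 12, 0, 12, 0);
    __m256i r      = _mm256_and_si256(_mm256_srlv_epi32(v, shifts), _mm256_set1_epi32(4095));
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; i++)
        printf("0x%03X ", out[i]);   /* 0xABC 0xDEF 0xABC 0xDEF ... */
    printf("\n");
    return 0;
}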
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m256d v_tmp; __m256d v00, v01, v02, v03; __m256d v10, v11, v12, v13; __m256d v20, v21, v22, v23; __m256d v30, v31, v32, v33; __m256i v_cur_roots; __m256 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions with n potentially negative. say hello to all the bugs int idx1, idx1_root; int idx2; int idx3, idx3_root; int pad_root, pad, pad_r; idx1 = ((int) mem->e_sz) - 1; idx1_root = ((int) mem->r_sz); // the conventio is that iteration i, idx1 points to the first element of line i+1 e[idx1++] = q[n]; // pad contains the padding for row i+1 // for row n it's always 3 pad = 3; pad_root = 7; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1 + pad; idx1_root -= 2*(n-i)+1 + pad_root; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx2 += pad; // padding of line i+1 // idx2 now points to the first element of the next line idx3 = idx1; idx3_root = idx1_root; pad_r = pad; for (r = i; r < n; ++r) { pad_r = (pad_r+1)&3; // padding of line r+1 idx1 = idx3; idx1_root = idx3_root; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; idx1_root++; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&15); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1_root] = r; } idx1++; idx1_root++; } v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp ); // execute the shit for 4 vectors of size 2 v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r); for( ; idx2 < l_end; idx2 += 16 ) { v01 = _mm256_load_pd( &w[idx1 ] ); v11 = _mm256_load_pd( &w[idx1+ 4] ); v21 = _mm256_load_pd( &w[idx1+ 8] ); v31 = _mm256_load_pd( &w[idx1+12] ); v00 = _mm256_load_pd( &e[idx2 ] ); v01 = _mm256_add_pd( v01, v_tmp ); v10 = _mm256_load_pd( &e[idx2+ 4] ); v11 = _mm256_add_pd( v11, v_tmp ); v20 = _mm256_load_pd( &e[idx2+ 8] ); v21 = _mm256_add_pd( v21, v_tmp ); v30 = _mm256_load_pd( &e[idx2+12] ); v31 = _mm256_add_pd( v31, v_tmp ); v01 = _mm256_add_pd( v01, v00 ); v03 = _mm256_load_pd( &e[idx1 ] ); v11 = _mm256_add_pd( v11, v10 ); v13 = _mm256_load_pd( &e[idx1+ 4] ); v21 = _mm256_add_pd( v21, v20 ); v23 = _mm256_load_pd( &e[idx1+ 8] ); v31 = _mm256_add_pd( v31, v30 ); v33 = _mm256_load_pd( &e[idx1+12] ); v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ ); v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ ); v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ ); v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ ); _mm256_maskstore_pd( &e[idx1 ], _mm256_castpd_si256( v02 ), v01 ); _mm256_maskstore_pd( &e[idx1+ 4], _mm256_castpd_si256( v12 ), v11 ); v_rootmask0 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v02)), _mm256_cvtpd_ps(v12) , 1 ); _mm256_maskstore_pd( &e[idx1+ 8], _mm256_castpd_si256( v22 ), v21 ); _mm256_maskstore_pd( &e[idx1+12], _mm256_castpd_si256( v32 ), v31 ); v_rootmask1 = _mm256_insertf128_ps( _mm256_castps128_ps256( _mm256_cvtpd_ps(v22)), _mm256_cvtpd_ps(v32) , 1 ); _mm256_maskstore_ps( &root[idx1_root ], _mm256_castps_si256( v_rootmask0 ), _mm256_castsi256_ps( v_cur_roots ) ); _mm256_maskstore_ps( &root[idx1_root + 8], _mm256_castps_si256( v_rootmask1 ), _mm256_castsi256_ps( v_cur_roots ) ); idx1 
+= 16; idx1_root += 16; } idx2 += pad_r; idx3++; idx3_root++; } pad = (pad -1)&3; pad_root = (pad_root-1)&7; } // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding // if n is even, the total number of entries in the first // row of the table is odd, so we need padding return e[ ((n/4)+1)*4 - 1 ]; }
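/* Minimal sketch (hypothetical demo, separate from the function above): _mm256_maskstore_pd
   writes only the lanes whose mask element has its sign bit set, which is how the scalar
   "if (t < e[idx1]) e[idx1] = t;" update gets vectorized. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    double dst[4] = {10.0, 10.0, 10.0, 10.0};
    __m256d candidate = _mm256_setr_pd(1.0, 5.0, 30.0, 40.0);
    __m256d current   = _mm256_loadu_pd(dst);
    __m256d lt        = _mm256_cmp_pd(candidate, current, _CMP_LT_OQ);
    _mm256_maskstore_pd(dst, _mm256_castpd_si256(lt), candidate);
    for (int i = 0; i < 4; i++)
        printf("%g ", dst[i]);       /* 1 5 10 10: only the "smaller" lanes were stored */
    printf("\n");
    return 0;
}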
parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int32_t score = NEG_INF; __m256i vNegInf = _mm256_set1_epi32(NEG_INF); __m256i vOpen = _mm256_set1_epi32(open); __m256i vGap = _mm256_set1_epi32(gap); __m256i vOne = _mm256_set1_epi32(1); __m256i vN = _mm256_set1_epi32(N); __m256i vGapN = _mm256_set1_epi32(gap*N); __m256i vNegOne = _mm256_set1_epi32(-1); __m256i vI = _mm256_set_epi32(0,1,2,3,4,5,6,7); __m256i vJreset = _mm256_set_epi32(0,-1,-2,-3,-4,-5,-6,-7); __m256i vMax = vNegInf; __m256i vILimit = _mm256_set1_epi32(s1Len); __m256i vILimit1 = _mm256_sub_epi32(vILimit, vOne); __m256i vJLimit = _mm256_set1_epi32(s2Len); __m256i vJLimit1 = _mm256_sub_epi32(vJLimit, vOne); __m256i vIBoundary = _mm256_set_epi32( -open-0*gap, -open-1*gap, -open-2*gap, -open-3*gap, -open-4*gap, -open-5*gap, -open-6*gap, -open-7*gap
__m256i
foo (int x1, int x2, int x3, int x4, int x5, int x6, int x7, int x8)
{
  return _mm256_set_epi32 (x1, x2, x3, x4, x5, x6, x7, x8);
}
INLINE avxi( StepTy ) : m256(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)) {}
#endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int32_t score = NEG_INF; __m256i vNegInf = _mm256_set1_epi32(NEG_INF); __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 4); /* shift in a 0 */ __m256i vOpen = _mm256_set1_epi32(open); __m256i vGap = _mm256_set1_epi32(gap); __m256i vZero = _mm256_set1_epi32(0); __m256i vOne = _mm256_set1_epi32(1); __m256i vN = _mm256_set1_epi32(N); __m256i vNegOne = _mm256_set1_epi32(-1); __m256i vI = _mm256_set_epi32(0,1,2,3,4,5,6,7); __m256i vJreset = _mm256_set_epi32(0,-1,-2,-3,-4,-5,-6,-7); __m256i vMax = vNegInf; __m256i vEndI = vNegInf; __m256i vEndJ = vNegInf; __m256i vILimit = _mm256_set1_epi32(s1Len); __m256i vJLimit = _mm256_set1_epi32(s2Len); /* convert _s1 from char to int in range 0-23 */ for (i=0; i<s1Len; ++i) { s1[i] = matrix->mapper[(unsigned char)_s1[i]]; } /* pad back of s1 with dummy values */ for (i=s1Len; i<s1Len_PAD; ++i) { s1[i] = 0; /* point to first matrix row because we don't care */
void Decoder::ADMMDecoder_deg_6_7_2_3_6() { int maxIter = maxIteration; float mu = 5.5f; float tableau[12] = { 0.0f }; if ((mBlocklength == 576) && (mNChecks == 288)) { mu = 3.37309f;//penalty tableau[2] = 0.00001f; tableau[3] = 2.00928f; tableau[6] = 4.69438f; } else if((mBlocklength == 2304) && (mNChecks == 1152) ) { mu = 3.81398683f;//penalty tableau[2] = 0.29669288f; tableau[3] = 0.46964023f; tableau[6] = 3.19548154f; } else { mu = 5.5;//penalty tableau[2] = 0.8f; tableau[3] = 0.8f; tableau[6] = 0.8f; } const float rho = 1.9f; //over relaxation parameter; const float un_m_rho = 1.0 - rho; const auto _rho = _mm256_set1_ps( rho ); const auto _un_m_rho = _mm256_set1_ps( un_m_rho ); float tableaX[12]; // // ON PRECALCULE LES CONSTANTES // #pragma unroll for (int i = 0; i < 7; i++) { tableaX[i] = tableau[ i ] / mu; } const auto t_mu = _mm256_set1_ps ( mu ); const auto t2_amu = _mm256_set1_ps ( tableau[ 2 ] / mu ); const auto t3_amu = _mm256_set1_ps ( tableau[ 3 ] / mu ); const auto t6_amu = _mm256_set1_ps ( tableau[ 6 ] / mu ); const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu ); const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu ); const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu ); const auto t2_deg = _mm256_set1_ps ( 2.0f ); const auto t3_deg = _mm256_set1_ps ( 3.0f ); const auto t6_deg = _mm256_set1_ps ( 6.0f ); const auto zero = _mm256_set1_ps ( 0.0f ); const auto un = _mm256_set1_ps ( 1.0f ); const __m256 a = _mm256_set1_ps ( 0.0f ); const __m256 b = _mm256_set1_ps ( 0.5f ); ////////////////////////////////////////////////////////////////////////////////////// #pragma unroll for( int j = 0; j < _mPCheckMapSize; j+=8 ) { _mm256_store_ps(&Lambda [j], a); _mm256_store_ps(&zReplica[j], b); _mm256_store_ps(&latestProjVector[j], b); } ////////////////////////////////////////////////////////////////////////////////////// for(int i = 0; i < maxIter; i++) { int ptr = 0; mIteration = i + 1; // // MEASURE OF THE VN EXECUTION TIME // #ifdef PROFILE_ON const auto start = timer(); #endif // // VN processing kernel // #pragma unroll for (int j = 0; j < _mBlocklength; j++) { const int degVn = VariableDegree[j]; float M[8] __attribute__((aligned(64))); if( degVn == 2 ){ #if 1 const int dVN = 2; for(int qq = 0; qq < 8; qq++) { M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; #pragma unroll for(int k = 1; k < dVN; k++) { M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; } } const auto m = _mm256_loadu_ps( M ); const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] ); const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu)); const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu)); const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero); _mm256_storeu_ps(&OutputFromDecoder[j], vMin); j += 7; #else const int degVN = 2; float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); #pragma unroll for(int k = 1; k < degVN; k++) temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]); ptr += degVN; const float _amu_ = tableaX[ degVN ]; const float _2_amu_ = _amu_+ _amu_; const float llr = _LogLikelihoodRatio[j]; const float t = temp - llr / mu; const float xx = (t - _amu_)/(degVn - _2_amu_); const float vMax = std::min(xx, 1.0f); const float vMin = std::max(vMax, 0.0f); OutputFromDecoder[j] = vMin; #endif }else if( degVn == 3 ){ #if 1 const int dVN = 3; for(int qq = 0; qq < 8; qq++) { M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; #pragma unroll for(int k = 1; k < 
dVN; k++) { M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; } } const auto m = _mm256_loadu_ps( M ); const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] ); const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu)); const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu)); const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero); _mm256_storeu_ps(&OutputFromDecoder[j], vMin); j += 7; #else const int degVN = 3; float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); #pragma unroll for(int k = 1; k < degVN; k++) temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]); ptr += degVN; const float _amu_ = tableaX[ degVN ]; const float _2_amu_ = _amu_+ _amu_; const float llr = _LogLikelihoodRatio[j]; const float t = temp - llr / mu; const float xx = (t - _amu_)/(degVn - _2_amu_); const float vMax = std::min(xx, 1.0f); const float vMin = std::max(vMax, 0.0f); OutputFromDecoder[j] = vMin; #endif }else if( degVn == 6 ){ #if 1 const int dVN = 6; for(int qq = 0; qq < 8; qq++) { M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; #pragma unroll for(int k = 1; k < dVN; k++) { M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr += 1; } } const auto m = _mm256_loadu_ps( M ); const auto llr = _mm256_loadu_ps( &_LogLikelihoodRatio[j] ); const auto t1 = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu)); const auto xx = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu)); const auto vMin = _mm256_max_ps(_mm256_min_ps(xx, un) , zero); _mm256_storeu_ps(&OutputFromDecoder[j], vMin); j += 7; #else const int degVN = 6; float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); #pragma unroll for(int k = 1; k < degVN; k++) temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]); ptr += degVN; const float _amu_ = tableaX[ degVN ]; const float _2_amu_ = _amu_+ _amu_; const float llr = _LogLikelihoodRatio[j]; const float t = temp - llr / mu; const float xx = (t - _amu_)/(degVn - _2_amu_); const float vMax = std::min(xx, 1.0f); const float vMin = std::max(vMax, 0.0f); OutputFromDecoder[j] = vMin; #endif } } // // MEASURE OF THE VN EXECUTION TIME // #ifdef PROFILE_ON t_vn += (timer() - start); #endif // // CN processing kernel // int CumSumCheckDegree = 0; // cumulative position of currect edge in factor graph int allVerified = 0; float vector_before_proj[8] __attribute__((aligned(64))); const auto zero = _mm256_set1_ps ( 0.0f ); const auto mask_6 = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); const auto mask_7 = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); const auto dot5 = _mm256_set1_ps( 0.5f ); // // MEASURE OF THE CN EXECUTION TIME // #ifdef PROFILE_ON const auto starT = timer(); #endif const auto seuilProj = _mm256_set1_ps( 1e-5f ); for(int j = 0; j < _mNChecks; j++) { if( CheckDegree[j] == 6 ){ const int cDeg6 = 0x3F; const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]); const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4); const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS ); int test = (_mm256_movemask_ps( synd ) & cDeg6); // deg 6 const auto syndrom = _mm_popcnt_u32( test ); const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]); const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]); const auto v1 = _mm256_mul_ps (xpred, _rho ); 
const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho ); const auto v3 = _mm256_add_ps ( v1, v2 ); const auto vect_proj = _mm256_sub_ps ( v3, _ambda ); // // ON REALISE LA PROJECTION !!! // allVerified += ( syndrom & 0x01 ); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON const auto START = timer(); #endif const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]); const auto different = _mm256_sub_ps ( vect_proj, latest ); const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const auto absolute = _mm256_and_ps ( different, maskAbsol ); const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS ); int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6 if( skip == false ) { const auto _ztemp = mp.projection_deg6( vect_proj ); const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred ); const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica ); const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho); const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho); const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 ); const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 ); _mm256_maskstore_ps(& Lambda[CumSumCheckDegree], mask_6, mLambda); _mm256_maskstore_ps(&zReplica[CumSumCheckDegree], mask_6, _ztemp); } _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON t_pj += (timer() - START); #endif CumSumCheckDegree += 6; }else if( CheckDegree[j] == 7 ) { const int cDeg7 = 0x7F; const auto offsets = _mm256_loadu_si256 ((const __m256i*)&t_col1 [CumSumCheckDegree]); const auto xpred = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4); const auto synd = _mm256_cmp_ps( xpred, dot5, _CMP_GT_OS ); const int test = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7 const auto syndrom = _mm_popcnt_u32( test ); const auto _Replica = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]); const auto _ambda = _mm256_loadu_ps( &Lambda [CumSumCheckDegree]); const auto v1 = _mm256_mul_ps ( xpred, _rho ); const auto v2 = _mm256_mul_ps ( _Replica, _un_m_rho ); const auto v3 = _mm256_add_ps ( v1, v2 ); const auto vect_proj = _mm256_sub_ps ( v3, _ambda ); // // ON REALISE LA PROJECTION !!! 
// allVerified += ( syndrom & 0x01 ); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON const auto START = timer(); #endif const auto latest = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]); const auto different = _mm256_sub_ps ( vect_proj, latest ); const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); const auto absolute = _mm256_and_ps ( different, maskAbsol ); const auto despass = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS ); int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7 if( skip == false ) { const auto _ztemp = mp.projection_deg7( vect_proj ); const auto _ztemp1 = _mm256_sub_ps(_ztemp, xpred ); const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica ); const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho); const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho); const auto nLambda = _mm256_add_ps( _ambda, _ztemp3 ); const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 ); _mm256_maskstore_ps(& Lambda [CumSumCheckDegree], mask_7, mLambda); _mm256_maskstore_ps(&zReplica [CumSumCheckDegree], mask_7, _ztemp); } _mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj); // // MEASURE OF THE PROJECTION EXECUTION TIME // #ifdef PROFILE_ON t_pj += (timer() - START); #endif CumSumCheckDegree += 7; }else{ exit( 0 ); } } // // MEASURE OF THE CN LOOP EXECUTION TIME // #ifdef PROFILE_ON t_cn += (timer() - starT); #endif #ifdef PROFILE_ON t_ex += 1; //FILE *ft=fopen("time.txt","a"); //fprintf(ft,"%d \n", t_cn/t_ex); //fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj); //fclose(ft); #endif if(allVerified == 0) { mAlgorithmConverge = true; mValidCodeword = true; break; } } // // MEASURE OF THE NUMBER OF EXECUTION // // #ifdef PROFILE_ON // t_ex += 1; // #endif }
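/* Minimal sketch (hypothetical demo, separate from the decoder above): _mm256_mask_i32gather_ps
   fetches base[index] only for lanes whose mask sign bit is set and keeps the src value
   elsewhere, which is how the degree-6 check nodes gather six of the eight lanes and leave the
   remaining two at zero. */
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    float table[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    __m256i m6  = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);   /* first six lanes active */
    __m256  got = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), table, idx,
                                           _mm256_castsi256_ps(m6), 4);
    float out[8];
    _mm256_storeu_ps(out, got);
    for (int i = 0; i < 8; i++)
        printf("%g ", out[i]);       /* 17 16 15 14 13 12 0 0 */
    printf("\n");
    return 0;
}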
_mm_set_epi32(0x00200000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x00400000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x00800000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x01000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x02000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x04000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x08000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x10000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x20000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x40000000, 0x00000000, 0x00000000, 0x00000000), _mm_set_epi32(0x80000000, 0x00000000, 0x00000000, 0x00000000) }; #ifdef AVX2 __m256i sseMasks256[256] = { _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000001), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000002), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000004), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000008), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000010), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000020), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000040), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000080), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000400), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000800), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00001000), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00002000), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00004000), _mm256_set_epi32(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00008000),
void run_softmax_int32_float_work_item_latency(nn_workload_item *const work_item) { nn_workload_data_t *input_view = work_item->input[0]->output; const auto &arguments = work_item->arguments.forward_softmax_fixedpoint; const auto input_width = input_view->parent->lengths.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p]; const auto output_width = work_item->output->view_end.t[NN_DATA_COORD_x] - work_item->output->view_begin.t[NN_DATA_COORD_x] + 1; const auto num_full_blocks = output_width / C_data_stride; const auto partial_block_size = (output_width / C_simd_width) % C_max_acc; const auto subsimd_block_size = output_width % C_simd_width; const auto output_view_start = work_item->output->view_begin.t[NN_DATA_COORD_x]; const auto input_view_start = input_view->view_begin.t[NN_DATA_COORD_z] * input_view->parent->lengths.t[NN_DATA_COORD_p]; const auto out_fraction = arguments.input_fraction; float * input_f = (float*)_mm_malloc(input_width * sizeof(float), 64); auto input_buffer = &static_cast<int32_t*>(input_view->parent->data_buffer)[input_view_start]; auto shift = out_fraction; if (shift > 0) { for (uint32_t i = 0; i < input_width; i++) input_f[i] = (float)(input_buffer[i]) / (1 << shift); } else if (shift < 0) { for (uint32_t i = 0; i < input_width; i++) input_f[i] = (float)(input_buffer[i]) * (1 << -shift); } else { for (uint32_t i = 0; i < input_width; i++) input_f[i] = (float)(input_buffer[i]); } __m256 acc_sum = _mm256_setzero_ps(); float subsimd_sum = 0.0f; { auto input_buffer = input_f; auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start]; for (auto block = 0u; block < num_full_blocks; ++block) { // Run computation. softmax_compute_block<C_max_acc>(input_buffer, output_buffer, acc_sum); } switch (partial_block_size) { case 0: break; case 1: softmax_compute_block< 1>(input_buffer, output_buffer, acc_sum); break; case 2: softmax_compute_block< 2>(input_buffer, output_buffer, acc_sum); break; case 3: softmax_compute_block< 3>(input_buffer, output_buffer, acc_sum); break; case 4: softmax_compute_block< 4>(input_buffer, output_buffer, acc_sum); break; case 5: softmax_compute_block< 5>(input_buffer, output_buffer, acc_sum); break; case 6: softmax_compute_block< 6>(input_buffer, output_buffer, acc_sum); break; case 7: softmax_compute_block< 7>(input_buffer, output_buffer, acc_sum); break; case 8: softmax_compute_block< 8>(input_buffer, output_buffer, acc_sum); break; case 9: softmax_compute_block< 9>(input_buffer, output_buffer, acc_sum); break; case 10: softmax_compute_block<10>(input_buffer, output_buffer, acc_sum); break; case 11: softmax_compute_block<11>(input_buffer, output_buffer, acc_sum); break; case 12: softmax_compute_block<12>(input_buffer, output_buffer, acc_sum); break; case 13: softmax_compute_block<13>(input_buffer, output_buffer, acc_sum); break; case 14: softmax_compute_block<14>(input_buffer, output_buffer, acc_sum); break; default: NN_UNREACHABLE_CODE; } switch (subsimd_block_size) { case 0: break; case 1: softmax_compute_subsimd<1>(input_buffer, output_buffer, subsimd_sum); break; case 2: softmax_compute_subsimd<2>(input_buffer, output_buffer, subsimd_sum); break; case 3: softmax_compute_subsimd<3>(input_buffer, output_buffer, subsimd_sum); break; case 4: softmax_compute_subsimd<4>(input_buffer, output_buffer, subsimd_sum); break; case 5: softmax_compute_subsimd<5>(input_buffer, output_buffer, subsimd_sum); break; case 6: softmax_compute_subsimd<6>(input_buffer, output_buffer, subsimd_sum); break; case 
7: softmax_compute_subsimd<7>(input_buffer, output_buffer, subsimd_sum); break; default: NN_UNREACHABLE_CODE; } } { __m256 intermediate_sum = _mm256_hadd_ps(acc_sum, acc_sum); intermediate_sum = _mm256_permutevar8x32_ps(intermediate_sum, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7)); intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum); intermediate_sum = _mm256_hadd_ps(intermediate_sum, intermediate_sum); acc_sum = _mm256_add_ps(intermediate_sum, _mm256_set1_ps(subsimd_sum)); subsimd_sum = _mm_cvtss_f32(_mm256_extractf128_ps(acc_sum, 0)); acc_sum = _mm256_div_ps(_mm256_set1_ps(1.0f), acc_sum); subsimd_sum = 1.0f / subsimd_sum; } { auto output_buffer = &static_cast<float*>(work_item->output->parent->data_buffer)[output_view_start]; for (auto block = 0u; block < num_full_blocks; ++block) { // Run computation. softmax_finalize_block<C_max_acc>(output_buffer, acc_sum); } switch (partial_block_size) { case 0: break; case 1: softmax_finalize_block< 1>(output_buffer, acc_sum); break; case 2: softmax_finalize_block< 2>(output_buffer, acc_sum); break; case 3: softmax_finalize_block< 3>(output_buffer, acc_sum); break; case 4: softmax_finalize_block< 4>(output_buffer, acc_sum); break; case 5: softmax_finalize_block< 5>(output_buffer, acc_sum); break; case 6: softmax_finalize_block< 6>(output_buffer, acc_sum); break; case 7: softmax_finalize_block< 7>(output_buffer, acc_sum); break; case 8: softmax_finalize_block< 8>(output_buffer, acc_sum); break; case 9: softmax_finalize_block< 9>(output_buffer, acc_sum); break; case 10: softmax_finalize_block<10>(output_buffer, acc_sum); break; case 11: softmax_finalize_block<11>(output_buffer, acc_sum); break; case 12: softmax_finalize_block<12>(output_buffer, acc_sum); break; case 13: softmax_finalize_block<13>(output_buffer, acc_sum); break; case 14: softmax_finalize_block<14>(output_buffer, acc_sum); break; default: NN_UNREACHABLE_CODE; } switch (subsimd_block_size) { case 0: break; case 1: softmax_finalize_subsimd<1>(output_buffer, subsimd_sum); break; case 2: softmax_finalize_subsimd<2>(output_buffer, subsimd_sum); break; case 3: softmax_finalize_subsimd<3>(output_buffer, subsimd_sum); break; case 4: softmax_finalize_subsimd<4>(output_buffer, subsimd_sum); break; case 5: softmax_finalize_subsimd<5>(output_buffer, subsimd_sum); break; case 6: softmax_finalize_subsimd<6>(output_buffer, subsimd_sum); break; case 7: softmax_finalize_subsimd<7>(output_buffer, subsimd_sum); break; default: NN_UNREACHABLE_CODE; } } _mm_free(input_f); }
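/* Minimal standalone sketch of the horizontal-sum idiom used above (hypothetical demo,
   separate from the softmax kernel): two hadd passes around a cross-lane permute collapse all
   eight floats into every lane. */
#include <stdio.h>
#include <immintrin.h>

static float hsum8(__m256 v)
{
    __m256 t = _mm256_hadd_ps(v, v);
    t = _mm256_permutevar8x32_ps(t, _mm256_set_epi32(0, 1, 4, 5, 2, 3, 6, 7));
    t = _mm256_hadd_ps(t, t);
    t = _mm256_hadd_ps(t, t);
    return _mm_cvtss_f32(_mm256_castps256_ps128(t));
}

int main(void)
{
    __m256 v = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
    printf("%g\n", hsum8(v));        /* 36 */
    return 0;
}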
INLINE avxi( int a, int b, int c, int d, int e, int f, int g, int h ) : m256(_mm256_set_epi32(h, g, f, e, d, c, b, a)) {}