/*
 * Count the set bits of the SIMD vector v_N.
 *
 * v_N          vector whose set bits are counted
 * precomputed  forwarded unchanged to BIT_COUNT on the non-AVX path;
 *              not used when __AVX is defined
 *
 * Returns the number of set bits summed over the whole vector.
 */
static inline unsigned int evaluatePopcount(INT_TYPE v_N, char *precomputed)
{
#ifdef __AVX
  /* The 256-bit store below writes 32 bytes.  unsigned long is only 32 bits
     wide on LLP64 ABIs, which would make a 4-element buffer too small, so
     use unsigned long long (at least 64 bits everywhere). */
  unsigned long long res[4] __attribute__ ((aligned (BYTE_ALIGNMENT)));
  unsigned int a, b;

  (void)precomputed;

  _mm256_store_pd((double*)res, v_N);

  a = __builtin_popcountll(res[0]) + __builtin_popcountll(res[1]);
  b = __builtin_popcountll(res[2]) + __builtin_popcountll(res[3]);

  return (a + b);
#else
  unsigned int i, sum = 0, counts[INTS_PER_VECTOR] __attribute__ ((aligned (BYTE_ALIGNMENT)));

  VECTOR_STORE((CAST)counts, v_N);

  /* Visit every lane of the spill buffer instead of a hard-coded four, so
     the count stays complete (and in bounds) whatever INTS_PER_VECTOR is. */
  for(i = 0; i < INTS_PER_VECTOR; i++)
    sum += BIT_COUNT(counts[i], precomputed);

  return sum;
#endif
}
/*
 * Return the number of set bits in the SIMD vector v.
 *
 * The vector is written to an aligned array of unsigned long words and the
 * population counts of the individual words are added up.
 */
static inline unsigned int vectorPopcount(INT_TYPE v)
{
  unsigned long lanes[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
  int total = 0;
  int lane = 0;

  /* Spill the vector so that each word can be counted on its own. */
  VECTOR_STORE((CAST)lanes, v);

  while(lane < LONG_INTS_PER_VECTOR)
  {
    total += __builtin_popcountl(lanes[lane]);
    ++lane;
  }

  return (unsigned int)total;
}
/*
 * Add the individual bits of the SIMD vector v to the per-site parsimony
 * scores of partition `model`.
 *
 * pr      partition list owning the perSiteParsScores array that is updated
 * model   index of the partition inside pr->partitionData
 * v       bit vector; bit b of word w increments the score stored at
 *         offset * PLL_PCF + w * ULINT_SIZE + b
 * offset  position of this vector, multiplied by PLL_PCF to obtain the index
 *         of the first score it touches
 */
static inline void storePerSiteScores (partitionList * pr, int model, INT_TYPE v, unsigned int offset)
{
  unsigned long words[LONG_INTS_PER_VECTOR] __attribute__ ((aligned (PLL_BYTE_ALIGNMENT)));
  int w, bit;

  /* Spill the vector so that its bits can be inspected one at a time. */
  VECTOR_STORE((CAST)words, v);

  for (w = 0; w < LONG_INTS_PER_VECTOR; w++)
  {
    parsimonyNumber * const site = &(pr->partitionData[model]->perSiteParsScores[offset * PLL_PCF + w * ULINT_SIZE]);
    const unsigned long word = words[w];

    for (bit = 0; bit < ULINT_SIZE; bit++)
      site[bit] += ((word >> bit) & 1);
  }
}
/*
 * Recompute the parsimony state vectors and parsimony scores of the nodes
 * listed in the traversal descriptor tr->ti.
 *
 * tr->ti[0] holds the end index of the descriptor.  Entries start at index 4
 * and are four ints apart; the first three ints of an entry are used here as
 * the node numbers p (the node that is written) and q, r (the two nodes it
 * is computed from).
 *
 * For each partition and each vector-wide slice of its parsimony vector the
 * routine forms, per state, the AND and the OR of the q and r vectors.
 * `shared` is the OR of all per-state ANDs.  The vector stored for p is
 *     AND_k | VECTOR_AND_NOT(shared, OR_k)
 * and populationCount(VECTOR_AND_NOT(shared, allOne)) is added to the score.
 * NOTE(review): this reads as "take the intersection where one exists, the
 * union otherwise, and count the positions without intersection", assuming
 * VECTOR_AND_NOT(a, b) computes (~a) & b -- confirm against the macro.
 *
 * State counts 2, 4 and 20 have dedicated kernels; every other count (up to
 * 32) is handled by the generic kernel.
 */
static void newviewParsimonyIterativeFast(tree *tr)
{
  INT_TYPE allOne = SET_ALL_BITS_ONE;
  int *ti = tr->ti;
  int count = ti[0];
  int index, model;

  for(index = 4; index < count; index += 4)
  {
    size_t pNumber = (size_t)ti[index];
    size_t qNumber = (size_t)ti[index + 1];
    size_t rNumber = (size_t)ti[index + 2];
    unsigned int totalScore = 0;

    for(model = 0; model < tr->NumberOfModels; model++)
    {
      size_t states = tr->partitionData[model].states;
      size_t width = tr->partitionData[model].parsimonyLength;
      size_t k;
      unsigned int col;

      /* Per-state row pointers of q (lhs), r (rhs) and p (out). */
      parsimonyNumber *lhs[32], *rhs[32], *out[32];

      assert(states <= 32);

      for(k = 0; k < states; k++)
      {
        lhs[k] = &(tr->partitionData[model].parsVect[(width * states * qNumber) + width * k]);
        rhs[k] = &(tr->partitionData[model].parsVect[(width * states * rNumber) + width * k]);
        out[k] = &(tr->partitionData[model].parsVect[(width * states * pNumber) + width * k]);
      }

      switch(states)
      {
      case 2:
        for(col = 0; col < width; col += INTS_PER_VECTOR)
        {
          INT_TYPE qv, rv, shared, both0, both1, any0, any1;

          qv = VECTOR_LOAD((CAST)(&lhs[0][col]));
          rv = VECTOR_LOAD((CAST)(&rhs[0][col]));
          both0 = VECTOR_BIT_AND(qv, rv);
          any0 = VECTOR_BIT_OR(qv, rv);

          qv = VECTOR_LOAD((CAST)(&lhs[1][col]));
          rv = VECTOR_LOAD((CAST)(&rhs[1][col]));
          both1 = VECTOR_BIT_AND(qv, rv);
          any1 = VECTOR_BIT_OR(qv, rv);

          shared = VECTOR_BIT_OR(both0, both1);

          VECTOR_STORE((CAST)(&out[0][col]), VECTOR_BIT_OR(both0, VECTOR_AND_NOT(shared, any0)));
          VECTOR_STORE((CAST)(&out[1][col]), VECTOR_BIT_OR(both1, VECTOR_AND_NOT(shared, any1)));

          shared = VECTOR_AND_NOT(shared, allOne);

          totalScore += populationCount(shared);
        }
        break;

      case 4:
        for(col = 0; col < width; col += INTS_PER_VECTOR)
        {
          INT_TYPE qv, rv, shared;
          INT_TYPE both0, both1, both2, both3;
          INT_TYPE any0, any1, any2, any3;

          qv = VECTOR_LOAD((CAST)(&lhs[0][col]));
          rv = VECTOR_LOAD((CAST)(&rhs[0][col]));
          both0 = VECTOR_BIT_AND(qv, rv);
          any0 = VECTOR_BIT_OR(qv, rv);

          qv = VECTOR_LOAD((CAST)(&lhs[1][col]));
          rv = VECTOR_LOAD((CAST)(&rhs[1][col]));
          both1 = VECTOR_BIT_AND(qv, rv);
          any1 = VECTOR_BIT_OR(qv, rv);

          qv = VECTOR_LOAD((CAST)(&lhs[2][col]));
          rv = VECTOR_LOAD((CAST)(&rhs[2][col]));
          both2 = VECTOR_BIT_AND(qv, rv);
          any2 = VECTOR_BIT_OR(qv, rv);

          qv = VECTOR_LOAD((CAST)(&lhs[3][col]));
          rv = VECTOR_LOAD((CAST)(&rhs[3][col]));
          both3 = VECTOR_BIT_AND(qv, rv);
          any3 = VECTOR_BIT_OR(qv, rv);

          shared = VECTOR_BIT_OR(VECTOR_BIT_OR(both0, both1), VECTOR_BIT_OR(both2, both3));

          VECTOR_STORE((CAST)(&out[0][col]), VECTOR_BIT_OR(both0, VECTOR_AND_NOT(shared, any0)));
          VECTOR_STORE((CAST)(&out[1][col]), VECTOR_BIT_OR(both1, VECTOR_AND_NOT(shared, any1)));
          VECTOR_STORE((CAST)(&out[2][col]), VECTOR_BIT_OR(both2, VECTOR_AND_NOT(shared, any2)));
          VECTOR_STORE((CAST)(&out[3][col]), VECTOR_BIT_OR(both3, VECTOR_AND_NOT(shared, any3)));

          shared = VECTOR_AND_NOT(shared, allOne);

          totalScore += populationCount(shared);
        }
        break;

      case 20:
        for(col = 0; col < width; col += INTS_PER_VECTOR)
        {
          size_t s;
          INT_TYPE qv, rv, shared = SET_ALL_BITS_ZERO, both[20], any[20];

          /* All loads are finished before the first store of this slice. */
          for(s = 0; s < 20; s++)
          {
            qv = VECTOR_LOAD((CAST)(&lhs[s][col]));
            rv = VECTOR_LOAD((CAST)(&rhs[s][col]));
            both[s] = VECTOR_BIT_AND(qv, rv);
            any[s] = VECTOR_BIT_OR(qv, rv);
            shared = VECTOR_BIT_OR(shared, both[s]);
          }

          for(s = 0; s < 20; s++)
            VECTOR_STORE((CAST)(&out[s][col]), VECTOR_BIT_OR(both[s], VECTOR_AND_NOT(shared, any[s])));

          shared = VECTOR_AND_NOT(shared, allOne);

          totalScore += populationCount(shared);
        }
        break;

      default:
        for(col = 0; col < width; col += INTS_PER_VECTOR)
        {
          size_t s;
          INT_TYPE qv, rv, shared = SET_ALL_BITS_ZERO, both[32], any[32];

          /* All loads are finished before the first store of this slice. */
          for(s = 0; s < states; s++)
          {
            qv = VECTOR_LOAD((CAST)(&lhs[s][col]));
            rv = VECTOR_LOAD((CAST)(&rhs[s][col]));
            both[s] = VECTOR_BIT_AND(qv, rv);
            any[s] = VECTOR_BIT_OR(qv, rv);
            shared = VECTOR_BIT_OR(shared, both[s]);
          }

          for(s = 0; s < states; s++)
            VECTOR_STORE((CAST)(&out[s][col]), VECTOR_BIT_OR(both[s], VECTOR_AND_NOT(shared, any[s])));

          shared = VECTOR_AND_NOT(shared, allOne);

          totalScore += populationCount(shared);
        }
        break;
      }
    }

    /* Score of p = changes counted here + scores already stored for r and q. */
    tr->parsimonyScore[pNumber] = totalScore + tr->parsimonyScore[rNumber] + tr->parsimonyScore[qNumber];
  }
}
bool CDecoder_OMS_fixed_SSE::decode_8bits(char Intrinsic_fix[], char Rprime_fix[], int nombre_iterations) { //////////////////////////////////////////////////////////////////////////// // // Initilisation des espaces memoire // const TYPE zero = VECTOR_ZERO; for (int i=0; i<MESSAGE; i++){ var_mesgs[i] = zero; } // //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// // // ENTRELACEMENT DES DONNEES D'ENTREE POUR POUVOIR EXPLOITER LE MODE SIMD // if( NOEUD%16 == 0 ){ uchar_transpose_sse((TYPE*)Intrinsic_fix, (TYPE*)var_nodes, NOEUD); }else{ char *ptrVar = (char*) var_nodes; for (int i=0; i<NOEUD; i++){ for (int z=0; z<16; z++){ ptrVar[16 * i + z] = Intrinsic_fix[z * NOEUD + i]; } } } // //////////////////////////////////////////////////////////////////////////// // unsigned int arret = 0; while ( nombre_iterations-- ) { TYPE *p_msg1r = var_mesgs; TYPE *p_msg1w = var_mesgs; #if PETIT == 1 TYPE **p_indice_nod1 = p_vn_adr; TYPE **p_indice_nod2 = p_vn_adr; #else const unsigned short *p_indice_nod1 = PosNoeudsVariable; const unsigned short *p_indice_nod2 = PosNoeudsVariable; #endif // arret = 0; const TYPE min_var = VECTOR_SET1( vSAT_NEG_VAR ); const TYPE max_msg = VECTOR_SET1( vSAT_POS_MSG ); for (int i=0; i<DEG_1_COMPUTATIONS; i++){ //IACA_START TYPE tab_vContr[DEG_1]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #if (DEG_1 & 0x01) == 1 const unsigned char sign8 = 0x80; const unsigned char isign8 = 0xC0; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8 = VECTOR_SET1( isign8 ); #else const unsigned char sign8 = 0x80; const unsigned char isign8b = 0x40; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8b = VECTOR_SET1( isign8b ); #endif #if PETIT == 1 #if MANUAL_PREFETCH == 1 _mm_prefetch((const char*)(p_indice_nod1[DEG_1]), _MM_HINT_T0); _mm_prefetch((const char*)(&p_msg1r[DEG_1]), _MM_HINT_T0); #endif #endif 
#pragma unroll(DEG_1) for(int j=0; j<DEG_1; j++){ #if PETIT == 1 TYPE vNoeud = VECTOR_LOAD( *p_indice_nod1 ); #else TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); #endif TYPE vMessg = VECTOR_LOAD(p_msg1r); TYPE vContr = VECTOR_SUB_AND_SATURATE_VAR_8bits(vNoeud, vMessg, min_var); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr, msign8); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_MIN( VECTOR_ABS( vContr), max_msg); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } #if PETIT == 1 #if MANUAL_PREFETCH == 1 for(int j=0 ; j<DEG_1 ; j++){ _mm_prefetch((const char*)(p_indice_nod1[j]), _MM_HINT_T0); } _mm_prefetch((const char*)(p_indice_nod1[DEG_1]), _MM_HINT_T0); #endif #endif TYPE cste_1 = VECTOR_MIN(VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG TYPE cste_2 = VECTOR_MIN(VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG #if (DEG_1 & 0x01) == 1 sign = VECTOR_XOR(sign, misign8); #else sign = VECTOR_XOR(sign, misign8b); #endif #pragma unroll(DEG_1) for(int j=0 ; j<DEG_1 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_MIN(VECTOR_ABS(vContr), max_msg ); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr, msign8)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD_AND_SATURATE_VAR_8bits(vContr, v2St, min_var); VECTOR_STORE( p_msg1w, v2St); #if PETIT == 1 VECTOR_STORE( *p_indice_nod2, v2Sr); #else VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); #endif p_msg1w += 1; p_indice_nod2 += 1; } // arret = arret || VECTOR_XOR_REDUCE( sign ); //IACA_END } ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 2 for (int i=0; i<DEG_2_COMPUTATIONS; i++){ #if (DEG_2 & 0x01) == 1 const unsigned char sign8 = 0x80; const unsigned char isign8 = 0xC0; const TYPE msign8 = 
VECTOR_SET1( sign8 ); const TYPE misign8 = VECTOR_SET1( isign8 ); #else const unsigned char sign8 = 0x80; const unsigned char isign8b = 0x40; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8b = VECTOR_SET1( isign8b ); #endif TYPE tab_vContr[DEG_2]; TYPE sign = zero; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #pragma unroll(DEG_2) for(int j=0 ; j<DEG_2 ; j++) { #if PETIT == 1 TYPE vNoeud = VECTOR_LOAD( *p_indice_nod1 ); #else TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); #endif TYPE vMessg = VECTOR_LOAD( p_msg1r ); TYPE vContr = VECTOR_SUB_AND_SATURATE_VAR_8bits(vNoeud, vMessg, min_var); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr, msign8); sign = VECTOR_XOR (sign, cSign); TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1 ); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG #if (DEG_2 & 0x01) == 1 sign = VECTOR_XOR(sign, misign8); #else sign = VECTOR_XOR(sign, misign8b); #endif #pragma unroll(DEG_2) for(int j=0 ; j<DEG_2 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr, msign8)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD_AND_SATURATE_VAR_8bits(vContr, v2St, min_var); VECTOR_STORE( p_msg1w, v2St); #if PETIT == 1 VECTOR_STORE( *p_indice_nod2, v2Sr); #else VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); #endif p_msg1w += 1; p_indice_nod2 += 1; } // arret = arret || VECTOR_XOR_REDUCE( sign ); } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 3 for (int i=0; 
i<DEG_3_COMPUTATIONS; i++){ #if (DEG_3 & 0x01) == 1 const unsigned char sign8 = 0x80; const unsigned char isign8 = 0xC0; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8 = VECTOR_SET1( isign8 ); #else const unsigned char sign8 = 0x80; const unsigned char isign8b = 0x40; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8b = VECTOR_SET1( isign8b ); #endif TYPE tab_vContr[DEG_3]; TYPE sign = zero; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; for(int j=0 ; j<DEG_3 ; j++) { #if PETIT == 1 TYPE vNoeud = VECTOR_LOAD( *p_indice_nod1 ); #else TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); #endif TYPE vMessg = VECTOR_LOAD( p_msg1r ); TYPE vContr = VECTOR_SUB_AND_SATURATE_VAR_8bits(vNoeud, vMessg, min_var); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr, msign8); sign = VECTOR_XOR (sign, cSign); TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1 ); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG #if (DEG_3 & 0x01) == 1 sign = VECTOR_XOR(sign, misign8); #else sign = VECTOR_XOR(sign, misign8b); #endif for(int j=0 ; j<DEG_3 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr, msign8)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD_AND_SATURATE_VAR_8bits(vContr, v2St, min_var); VECTOR_STORE( p_msg1w, v2St); #if PETIT == 1 VECTOR_STORE( *p_indice_nod2, v2Sr); #else VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); #endif p_msg1w += 1; p_indice_nod2 += 1; } // arret = arret || VECTOR_XOR_REDUCE( sign ); } #endif 
///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 4 for (int i=0; i<DEG_4_COMPUTATIONS; i++){ #if (DEG_4 & 0x01) == 1 const unsigned char sign8 = 0x80; const unsigned char isign8 = 0xC0; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8 = VECTOR_SET1( isign8 ); #else const unsigned char sign8 = 0x80; const unsigned char isign8b = 0x40; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8b = VECTOR_SET1( isign8b ); #endif TYPE tab_vContr[DEG_4]; TYPE sign = zero; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; for(int j=0 ; j<DEG_4 ; j++) { #if PETIT == 1 TYPE vNoeud = VECTOR_LOAD( *p_indice_nod1 ); #else TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); #endif TYPE vMessg = VECTOR_LOAD( p_msg1r ); TYPE vContr = VECTOR_SUB_AND_SATURATE_VAR_8bits(vNoeud, vMessg, min_var); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr, msign8); sign = VECTOR_XOR (sign, cSign); TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1 ); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG #if (DEG_4 & 0x01) == 1 sign = VECTOR_XOR(sign, misign8); #else sign = VECTOR_XOR(sign, misign8b); #endif for(int j=0 ; j<DEG_4 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr, msign8)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD_AND_SATURATE_VAR_8bits(vContr, v2St, min_var); VECTOR_STORE( p_msg1w, v2St); #if PETIT == 1 VECTOR_STORE( *p_indice_nod2, v2Sr); #else VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); #endif p_msg1w += 
1; p_indice_nod2 += 1; } // arret = arret || VECTOR_XOR_REDUCE( sign ); } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 5 for (int i=0; i<DEG_5_COMPUTATIONS; i++){ #if (DEG_5 & 0x01) == 1 const unsigned char sign8 = 0x80; const unsigned char isign8 = 0xC0; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8 = VECTOR_SET1( isign8 ); #else const unsigned char sign8 = 0x80; const unsigned char isign8b = 0x40; const TYPE msign8 = VECTOR_SET1( sign8 ); const TYPE misign8b = VECTOR_SET1( isign8b ); #endif TYPE tab_vContr[DEG_5]; TYPE sign = zero; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; for(int j=0 ; j<DEG_5 ; j++) { #if PETIT == 1 TYPE vNoeud = VECTOR_LOAD( *p_indice_nod1 ); #else TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); #endif TYPE vMessg = VECTOR_LOAD( p_msg1r ); TYPE vContr = VECTOR_SUB_AND_SATURATE_VAR_8bits(vNoeud, vMessg, min_var); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr, msign8); sign = VECTOR_XOR (sign, cSign); TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1 ); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); // ON SATURE DIREECTEMENT AU FORMAT MSG #if (DEG_5 & 0x01) == 1 sign = VECTOR_XOR(sign, misign8); #else sign = VECTOR_XOR(sign, misign8b); #endif for(int j=0 ; j<DEG_5 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS ( VECTOR_MIN(vContr, max_msg) ); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr, msign8)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD_AND_SATURATE_VAR_8bits(vContr, v2St, min_var); VECTOR_STORE( p_msg1w, v2St); #if PETIT == 1 VECTOR_STORE( *p_indice_nod2, 
v2Sr); #else VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); #endif p_msg1w += 1; p_indice_nod2 += 1; } // arret = arret || VECTOR_XOR_REDUCE( sign ); } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES > 5 printf("The number of DEGREE(Cn) IS HIGHER THAN 5. YOU NEED TO PERFORM A COPY PASTE IN SOURCE CODE...\n"); exit( 0 ); #endif ///////////////////////////////////////////////////////////////////////////////// // // GESTION DU CRITERE D'ARRET // // if( (arret == 0) && (fast_stop == 1) ){ // break; // } } //////////////////////////////////////////////////////////////////////////// // // ON REMET EN FORME LES DONNEES DE SORTIE POUR LA SUITE DU PROCESS // if( NOEUD%16 == 0 ){ uchar_itranspose_sse((TYPE*)var_nodes, (TYPE*)Rprime_fix, NOEUD); }else{ char* ptr = (char*) var_nodes; for (int i=0; i<NOEUD; i+=1){ for (int j=0; j<16; j+=1){ Rprime_fix[j*NOEUD +i] = (ptr[16*i+j] > 0); } } } // //////////////////////////////////////////////////////////////////////////// return 1; }
bool CDecoder_OMS_fixed_NEON16_v3::decode_8bits(signed char Intrinsic_fix[], signed char Rprime_fix[], int nombre_iterations) { //////////////////////////////////////////////////////////////////////////// // // Initilisation des espaces memoire // // for (int i=0; i<MESSAGE; i++){ // var_mesgs[i] = VECTOR_ZERO; // } // //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// // // ENTRELACEMENT DES DONNEES D'ENTREE POUR POUVOIR EXPLOITER LE MODE SIMD // if( NOEUD%16 == 0 ){ uchar_transpose_neon((trans_TYPE*)Intrinsic_fix, (trans_TYPE*)var_nodes, NOEUD); }else{ signed char* ptrVar = (signed char*) var_nodes; for (int i=0; i<NOEUD; i++){ for (int z=0; z<16; z++){ ptrVar[16 * i + z] = Intrinsic_fix[z * NOEUD + i]; } } } // //////////////////////////////////////////////////////////////////////////// nombre_iterations--; if( 1 ) { TYPE *p_msg1w = var_mesgs; const unsigned short *p_indice_nod1 = PosNoeudsVariable; const unsigned short *p_indice_nod2 = PosNoeudsVariable; //const TYPE min_var = VECTOR_SET1( -127 ); const TYPE max_msg = VECTOR_SET1( 31 ); #if NB_DEGRES >= 1 for (int i=0; i<DEG_1_COMPUTATIONS; i++){ TYPE tab_vContr[DEG_1]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #ifdef _PREFETCH_ __builtin_prefetch (p_indice_nod1 + DEG_1, 0, 3); #endif for(int j=0; j<DEG_1; j++){ TYPE vContr = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); //#ifdef _PREFETCH_ // if( (j & 0x01) == 0 ) __builtin_prefetch (p_msg1r+DEG_1, 0, 0); //#endif TYPE cSign = VECTOR_GET_SIGN_BIT(vContr); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_ABS( vContr ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; } #ifdef _PREFETCH_ for(int j=0; j<DEG_1; j++){ __builtin_prefetch (&var_nodes[p_indice_nod1[j]], 0, 3); } #endif TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, 
VECTOR_SET1(offset)), max_msg); TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); for(int j=0 ; j<DEG_1 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS (vContr); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); vRes = VECTOR_MIN(vRes, max_msg); // BLG TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD(vContr, v2St); VECTOR_STORE( p_msg1w, v2St); VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); p_msg1w += 1; p_indice_nod2 += 1; } } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 2 for (int i=0; i<DEG_2_COMPUTATIONS; i++){ TYPE tab_vContr[DEG_2]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #ifdef _PREFETCH_ __builtin_prefetch (p_indice_nod1 + DEG_2, 0, 3); #endif for(int j=0; j<DEG_2; j++){ TYPE vContr = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); //#ifdef _PREFETCH_ // if( (j & 0x01) == 0 ) __builtin_prefetch (p_msg1r+DEG_2, 0, 0); //#endif TYPE cSign = VECTOR_GET_SIGN_BIT(vContr); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_ABS( vContr ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; } #ifdef _PREFETCH_ for(int j=0; j<DEG_2; j++){ __builtin_prefetch (&var_nodes[p_indice_nod1[j]], 0, 3); } #endif TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); for(int j=0 ; j<DEG_2 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS (vContr); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); vRes = VECTOR_MIN(vRes, max_msg); // BLG TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD(vContr, v2St); VECTOR_STORE( p_msg1w, v2St); VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); p_msg1w += 1; 
p_indice_nod2 += 1; } } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES > 2 printf("The number of DEGREE(Cn) IS HIGHER THAN 5. YOU NEED TO PERFORM A COPY PASTE IN SOURCE CODE...\n"); exit( 0 ); #endif } // // // ON REPREND LE TRAITEMENT NORMAL DE L'INFORMATION // // while (nombre_iterations-- != 1) { TYPE *p_msg1r = var_mesgs; TYPE *p_msg1w = var_mesgs; const unsigned short *p_indice_nod1 = PosNoeudsVariable; const unsigned short *p_indice_nod2 = PosNoeudsVariable; // const TYPE min_var = VECTOR_SET1( -127 ); const TYPE max_msg = VECTOR_SET1( 31 ); #if NB_DEGRES >= 1 for (int i=0; i<DEG_1_COMPUTATIONS; i++){ TYPE tab_vContr[DEG_1]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #ifdef _PREFETCH_ __builtin_prefetch (p_indice_nod1 + DEG_1, 0, 3); #endif for(int j=0; j<DEG_1; j++){ TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); TYPE vMessg = VECTOR_LOAD(p_msg1r); #ifdef _PREFETCH_ if( (j & 0x01) == 0 ) __builtin_prefetch (p_msg1r+DEG_1, 0, 0); #endif TYPE vContr = VECTOR_SUB(vNoeud, vMessg); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_ABS( vContr ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } #ifdef _PREFETCH_ for(int j=0; j<DEG_1; j++){ __builtin_prefetch (&var_nodes[p_indice_nod1[j]], 0, 3); } #endif TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); for(int j=0 ; j<DEG_1 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS (vContr); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); vRes = VECTOR_MIN(vRes, max_msg); // BLG TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD(vContr, v2St); VECTOR_STORE( p_msg1w, v2St); VECTOR_STORE( 
&var_nodes[(*p_indice_nod2)], v2Sr); p_msg1w += 1; p_indice_nod2 += 1; } } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 2 for (int i=0; i<DEG_2_COMPUTATIONS; i++){ TYPE tab_vContr[DEG_2]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #ifdef _PREFETCH_ __builtin_prefetch (p_indice_nod1 + DEG_2, 0, 3); #endif for(int j=0; j<DEG_2; j++){ TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); TYPE vMessg = VECTOR_LOAD(p_msg1r); #ifdef _PREFETCH_ if( (j & 0x01) == 0 ) __builtin_prefetch (p_msg1r+DEG_2, 0, 0); #endif TYPE vContr = VECTOR_SUB(vNoeud, vMessg); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_ABS( vContr ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } #ifdef _PREFETCH_ for(int j=0; j<DEG_2; j++){ __builtin_prefetch (&var_nodes[p_indice_nod1[j]], 0, 3); } #endif TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); for(int j=0 ; j<DEG_2 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS (vContr); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); vRes = VECTOR_MIN(vRes, max_msg); // BLG TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD(vContr, v2St); VECTOR_STORE( p_msg1w, v2St); VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); p_msg1w += 1; p_indice_nod2 += 1; } } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES > 2 printf("The number of DEGREE(Cn) IS HIGHER THAN 5. 
YOU NEED TO PERFORM A COPY PASTE IN SOURCE CODE...\n"); exit( 0 ); #endif } { TYPE *p_msg1r = var_mesgs; const unsigned short *p_indice_nod1 = PosNoeudsVariable; const unsigned short *p_indice_nod2 = PosNoeudsVariable; // const TYPE min_var = VECTOR_SET1( -127 ); const TYPE max_msg = VECTOR_SET1( 31 ); #if NB_DEGRES >= 1 for (int i=0; i<DEG_1_COMPUTATIONS; i++){ TYPE tab_vContr[DEG_1]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #ifdef _PREFETCH_ __builtin_prefetch (p_indice_nod1 + DEG_1, 0, 3); #endif for(int j=0; j<DEG_1; j++){ TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); TYPE vMessg = VECTOR_LOAD(p_msg1r); #ifdef _PREFETCH_ if( (j & 0x01) == 0 ) __builtin_prefetch (p_msg1r+DEG_1, 0, 0); #endif TYPE vContr = VECTOR_SUB(vNoeud, vMessg); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_ABS( vContr ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } #ifdef _PREFETCH_ for(int j=0; j<DEG_1; j++){ __builtin_prefetch (&var_nodes[p_indice_nod1[j]], 0, 3); } #endif TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); for(int j=0 ; j<DEG_1 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS (vContr); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); vRes = VECTOR_MIN(vRes, max_msg); // BLG TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD(vContr, v2St); VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); p_indice_nod2 += 1; } } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES >= 2 for (int i=0; i<DEG_2_COMPUTATIONS; i++){ TYPE tab_vContr[DEG_2]; TYPE sign = VECTOR_ZERO; TYPE min1 = VECTOR_SET1(vSAT_POS_VAR); TYPE min2 = min1; #ifdef _PREFETCH_ 
__builtin_prefetch (p_indice_nod1 + DEG_2, 0, 3); #endif for(int j=0; j<DEG_2; j++){ TYPE vNoeud = VECTOR_LOAD(&var_nodes[(*p_indice_nod1)]); TYPE vMessg = VECTOR_LOAD(p_msg1r); #ifdef _PREFETCH_ if( (j & 0x01) == 0 ) __builtin_prefetch (p_msg1r+DEG_2, 0, 0); #endif TYPE vContr = VECTOR_SUB(vNoeud, vMessg); TYPE cSign = VECTOR_GET_SIGN_BIT(vContr); sign = VECTOR_XOR(sign, cSign); TYPE vAbs = VECTOR_ABS( vContr ); tab_vContr[j] = vContr; TYPE vTemp = min1; min1 = VECTOR_MIN_1(vAbs, min1); min2 = VECTOR_MIN_2(vAbs, vTemp, min2); p_indice_nod1 += 1; p_msg1r += 1; } #ifdef _PREFETCH_ for(int j=0; j<DEG_1; j++){ __builtin_prefetch (&var_nodes[p_indice_nod1[j]], 0, 3); } #endif TYPE cste_1 = VECTOR_MIN( VECTOR_SBU(min2, VECTOR_SET1(offset)), max_msg); TYPE cste_2 = VECTOR_MIN( VECTOR_SBU(min1, VECTOR_SET1(offset)), max_msg); for(int j=0 ; j<DEG_2 ; j++) { TYPE vContr = tab_vContr[j]; TYPE vAbs = VECTOR_ABS (vContr); TYPE vRes = VECTOR_CMOV (vAbs, min1, cste_1, cste_2); vRes = VECTOR_MIN(vRes, max_msg); // BLG TYPE vSig = VECTOR_XOR (sign, VECTOR_GET_SIGN_BIT(vContr)); TYPE v2St = VECTOR_invSIGN2(vRes, vSig); TYPE v2Sr = VECTOR_ADD(vContr, v2St); VECTOR_STORE( &var_nodes[(*p_indice_nod2)], v2Sr); p_indice_nod2 += 1; } } #endif ///////////////////////////////////////////////////////////////////////////////// #if NB_DEGRES > 2 printf("The number of DEGREE(Cn) IS HIGHER THAN 5. YOU NEED TO PERFORM A COPY PASTE IN SOURCE CODE...\n"); exit( 0 ); #endif } //////////////////////////////////////////////////////////////////////////// // // ON REMET EN FORME LES DONNEES DE SORTIE POUR LA SUITE DU PROCESS // if( NOEUD%16 == 0 ){ uchar_itranspose_neon((trans_TYPE*)var_nodes, (trans_TYPE*)Rprime_fix, NOEUD); }else{ signed char* ptr = (signed char*) var_nodes; for (int i=0; i<NOEUD; i+=1){ for (int j=0; j<16; j+=1){ Rprime_fix[j*NOEUD +i] = (ptr[16*i+j] > 0); } } } // //////////////////////////////////////////////////////////////////////////// return 0; }