/*
 * Initialise the 512-bit constants used by the SMS4 KNC code and build the
 * 32-bit S-box table.
 *
 * Sets mask_ff00, mask_ffff, mask_ff0000 (every 32-bit lane holds the named
 * mask), vindex_0s (all lanes zero) and vindex_4i (lane i holds 4 * i), then
 * calls sms4_init_sbox32().
 *
 * key: not read by this routine.
 */
void sms4_knc_encrypt_init(sms4_key_t *key)
{
	enum { LANES = 16 };	/* 32-bit lanes in one __m512i */

	/*
	 * _mm512_load_epi32 is an aligned load: the source must sit on a
	 * 64-byte boundary.  The staging buffer is therefore declared as an
	 * explicitly aligned int array (no pointer-cast aliasing needed).
	 */
	int lanes[LANES] __attribute__((aligned(64)));
	int i;

	(void)key;

	for (i = 0; i < LANES; i++)
		lanes[i] = 0xff00;
	mask_ff00 = _mm512_load_epi32(lanes);

	for (i = 0; i < LANES; i++)
		lanes[i] = 0xffff;
	mask_ffff = _mm512_load_epi32(lanes);

	for (i = 0; i < LANES; i++)
		lanes[i] = 0xff0000;
	mask_ff0000 = _mm512_load_epi32(lanes);

	for (i = 0; i < LANES; i++)
		lanes[i] = 0;
	vindex_0s = _mm512_load_epi32(lanes);

	for (i = 0; i < LANES; i++)
		lanes[i] = 4 * i;
	vindex_4i = _mm512_load_epi32(lanes);

	sms4_init_sbox32();
}
inline void scatter(float *ptr, const int *offsets) const { __m512i indices; SHORTVEC_ASSERT_ALIGNED(offsets, 64); indices = _mm512_load_epi32(offsets); _mm512_i32scatter_ps(ptr, indices, val1, 4); indices = _mm512_load_epi32(offsets + 16); _mm512_i32scatter_ps(ptr, indices, val2, 4); }
inline void gather(const float *ptr, const int *offsets) { __m512i indices; SHORTVEC_ASSERT_ALIGNED(offsets, 64); indices = _mm512_load_epi32(offsets); val1 = _mm512_i32gather_ps(indices, ptr, 4); indices = _mm512_load_epi32(offsets + 16); val2 = _mm512_i32gather_ps(indices, ptr, 4); }
int main() { __m512d t0,t1; double d1[8] __attribute__ ((aligned(64))); double d2[8] __attribute__ ((aligned(64))); double d3[8] __attribute__ ((aligned(64))); for(int i=0; i<8; i++) { d1[i]= i*1.0; d2[i]= 0.0; d3[i] = d1[i]; } //printf("testing intialization of registers\n"); //_mm512_store_pd(d1,t0); //printf("d1=t0: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); //_mm512_store_pd(d1,t1); //printf("d1=t1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); t0 = _mm512_load_pd(d1); printf("permute backward\n"); t1 = (__m512d) _mm512_permute4f128_epi32 ( (__m512i) t0, 0b00111001); _mm512_store_pd(d2,t1); printf("d1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); printf("d2: %f %f %f %f %f %f %f %f \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]); printf("permute forward\n"); t1 = (__m512d) _mm512_permute4f128_epi32 ( (__m512i) t0, 0b10010011); _mm512_store_pd(d2,t1); printf("d1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); printf("d2: %f %f %f %f %f %f %f %f \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]); int __attribute__((aligned(64))) order[16]={0,1,0,1,4,5,6,7,8,9,10,11,12,13,14,15}; __m512i morder = _mm512_load_epi32(order); printf("permuting doubles\n"); t1 = (__m512d) _mm512_permutevar_epi32 (morder, (__m512i) t0); _mm512_store_pd(d2,t1); printf("d1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); printf("d2: %f %f %f %f %f %f %f %f \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]); return 0; }
/*
 * Move every 32-bit lane of a one position towards the higher index
 * (lane k receives the old lane k - 1) and put 0 in lane 0.
 * The shift is done through an aligned memory buffer.
 */
inline __m512i shiftRight(__m512i a)
{
    enum { LANE_COUNT = 64 / sizeof(int32_t) };
    int32_t lanes[LANE_COUNT] __attribute__((aligned(64)));

    /* Spill the register to memory. */
    _mm512_store_epi32(lanes, a);

    /* Shift in memory, walking from the top lane down so nothing is overwritten early. */
    for (int dst = LANE_COUNT - 1; dst >= 1; --dst) {
        lanes[dst] = lanes[dst - 1];
    }
    lanes[0] = 0;

    /* Reload the shifted lanes into a register and return it. */
    return _mm512_load_epi32(lanes);
}
/*
 * Striped (Farrar-style) Smith-Waterman score of one database sequence
 * against the query profile, using 16 x 32-bit lanes per vector.
 *
 * o                 working buffers: columnaPrevia_Max / columnaActual_Max
 *                   (H values of the previous / current column, swapped after
 *                   every database letter) and columna_Left (E values).
 * sec_database      database sequence; each character is mapped through the
 *                   file-level table letras[].
 * long_sec_database number of characters of sec_database to process.
 *
 * Also reads the file-level values long_sec_ref, long_profile, profile,
 * coste_open_gap and coste_extend_gap.
 *
 * Returns the best local score found, saturated to 32767.
 *
 * NOTE(review): sec_database[x] is used as an index after a plain (int)
 * cast; if char is signed and the data contains bytes >= 0x80 the index is
 * negative - confirm the input alphabet with the caller.
 */
inline static int16_t smith_waterman_farrar(Objeto*o, char *sec_database, int16_t long_sec_database){
    int32_t * aux_Max;
    int16_t ret_max = 0;
    __m512i vGapOpen, vGapExtend, zero;
    __m512i vF, vH, vMax, vE_j, vAux0;

    /* Lanes per vector and number of vectors needed to cover long_sec_ref. */
    int segLen = (64 / sizeof(int32_t));
    int numSeg = (long_sec_ref + 63 / sizeof(int32_t)) / (64 / sizeof(int32_t));

    /* Broadcast the gap penalties to every lane through aligned buffers. */
    int32_t cog[segLen] __attribute__((aligned(64)));
    int32_t ceg[segLen] __attribute__((aligned(64)));
    for(int x=0;x<segLen;x++) {
        cog[x] = coste_open_gap;
        ceg[x] = coste_extend_gap;
    }
    vGapOpen = _mm512_load_epi32(cog);
    vGapExtend = _mm512_load_epi32(ceg);

    /*
     * Build the zero vector from an initialised register: XOR-ing a
     * variable with itself before it holds a value reads an indeterminate
     * value.
     */
    zero = _mm512_xor_epi32(vGapOpen, vGapOpen);
    vMax = zero;    /* vMax = <0, 0, ..., 0> */

    /* Clear the previous-column H values and the E values. */
    for(int j=0; j<long_profile; j++){
        o->columnaPrevia_Max[j] = 0;
        o->columna_Left[j] = 0;
    }

    for(int x=0; x<long_sec_database; x++){
        /* vF = <0, 0, ..., 0> */
        vF = zero;

        /* vH = last segment of the previous column, shifted by one lane */
        vH = _mm512_load_epi32(o->columnaPrevia_Max + (numSeg - 1) * segLen);
        vH = shiftRight(vH);

        /* Select the profile row for the current database letter. */
        int8_t pos_letra_database = letras[(int)(sec_database[x])];
        int32_t offset = pos_letra_database * long_profile;

        int j;
        for(j=0; j<numSeg; j++){
            /* vH = vH + vProfile[letter][j] */
            int32_t * valor_match = profile + offset;
            offset += segLen;
            vAux0 = _mm512_load_epi32(valor_match);
            vH = _mm512_add_epi32(vH, vAux0);

            /* vMax = max(vMax, vH) */
            vMax = _mm512_max_epi32(vMax, vH);

            /* vH = max(vH, vE[j], vF) */
            vE_j = _mm512_load_epi32(o->columna_Left + j*segLen);
            vH = _mm512_max_epi32(vH, vE_j);
            vH = _mm512_max_epi32(vH, vF);

            /* vHStore[j] = vH */
            _mm512_store_epi32(o->columnaActual_Max + j*segLen, vH);

            /* vAux = max(vH - vGapOpen, 0) */
            vAux0 = _mm512_sub_epi32(vH, vGapOpen);
            vAux0 = _mm512_max_epi32(vAux0, zero);

            /* vE[j] = max(max(vE[j] - vGapExtend, 0), vAux) */
            vE_j = _mm512_sub_epi32(vE_j, vGapExtend);
            vE_j = _mm512_max_epi32(vE_j, zero);
            vE_j = _mm512_max_epi32(vE_j, vAux0);
            _mm512_store_epi32(o->columna_Left + j*segLen, vE_j);

            /* vF = max(max(vF - vGapExtend, 0), vAux) */
            vF = _mm512_sub_epi32(vF, vGapExtend);
            vF = _mm512_max_epi32(vF, zero);
            vF = _mm512_max_epi32(vF, vAux0);

            /* vH = vHLoad[j]: previous-column value feeding the next segment */
            vH = _mm512_load_epi32(o->columnaPrevia_Max + j*segLen);
        }

        /*
         * SWAT-style correction pass: keep propagating vF through the
         * stored H values while any lane of vF still beats
         * max(vHStore[j] - vGapOpen, 0).
         */
        vF = shiftRight(vF);
        j = 0;
        do {
            vH = _mm512_load_epi32(o->columnaActual_Max + j*segLen);
            vAux0 = _mm512_sub_epi32(vH, vGapOpen);
            vAux0 = _mm512_max_epi32(vAux0, zero);
            __mmask16 mascara = _mm512_cmpgt_epi32_mask (vF, vAux0);
            if (mascara == 0) break;

            /* vHStore[j] = max(vHStore[j], vF) */
            vH = _mm512_max_epi32(vH, vF);
            _mm512_store_epi32(o->columnaActual_Max + j*segLen, vH);

            /* vF = max(vF - vGapExtend, 0) */
            vF = _mm512_sub_epi32(vF, vGapExtend);
            vF = _mm512_max_epi32(vF, zero);

            if (++j >= numSeg) {
                /* Wrapped past the last segment: shift vF by one lane and restart. */
                vF = shiftRight(vF);
                j = 0;
            }
        } while(1);

        /* The current column becomes the previous column for the next letter. */
        aux_Max = o->columnaActual_Max;
        o->columnaActual_Max = o->columnaPrevia_Max;
        o->columnaPrevia_Max = aux_Max;
    }

    /* Horizontal maximum over the lanes of vMax, saturated to the int16_t range. */
    int32_t max[segLen] __attribute__((aligned(64)));
    _mm512_store_epi32(max, vMax);
    for(int x=1;x<segLen;x++) {
        if(max[0] < max[x]) max[0] = max[x];
    }
    if (max[0] > 32767) max[0] = 32767;
    ret_max = max[0];
    return ret_max;
}
// Main spin update routine __forceinline void spinFlipCore(uint32_t* updated, const uint32_t* neighbours, const uint32_t* field, unsigned int x, unsigned int y, unsigned int z, __m512i rndInt) { #ifdef __MIC__ const __m512i one = _mm512_set1_epi32(0xFFFFFFFF); const __m512i zero = _mm512_setzero_epi32(); // calculate indices unsigned int x0 = (x+N-1)%N; unsigned int y0 = (y+N-1)%N; unsigned int z0 = (z+N-1)%N; unsigned int x1 = (x+17)%N; unsigned int y1 = (y+1)%N; unsigned int z1 = (z+1)%N; // neighbour spins Iu32vec16 n[6]; n[0] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), neighbours + z*N*N + y*N + x0); n[0] = _mm512_loadunpackhi_epi32(n[0], neighbours + z*N*N + y*N + x + 16 - 1); n[1] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), neighbours + z*N*N + y*N + x + 1); n[1] = _mm512_loadunpackhi_epi32(n[1], neighbours + z*N*N + y*N + x1); n[2] = _mm512_load_epi32(neighbours + z*N*N + y0*N + x); n[3] = _mm512_load_epi32(neighbours + z*N*N + y1*N + x); n[4] = _mm512_load_epi32(neighbours + z0*N*N + y*N + x); n[5] = _mm512_load_epi32(neighbours + z1*N*N + y*N + x); // bits are set if spins are antiparallel unsigned int i = z*N*N + y*N + x; Iu32vec16 current = _mm512_load_epi32(updated + i); #pragma unroll(6) for(int j = 0; j < 6; ++j) n[j] = current ^ n[j]; // count wrong spins using vertical counters Iu32vec16 c0, c1, c2, carry; c0 = n[0] ^ n[1]; c1 = n[0] & n[1]; c0 ^= n[2]; c1 |= andn(c0, n[2]); c0 ^= n[3]; carry = andn(c0, n[3]); c1 ^= carry; c2 = andn(c1, carry); c0 ^= n[4]; carry = andn(c0, n[4]); c1 ^= carry; c2 |= andn(c1, carry); c0 ^= n[5]; carry = andn(c0, n[5]); c1 ^= carry; c2 |= andn(c1, carry); Iu32vec16 w1 = andn(c2, andn(c1, c0)); Iu32vec16 w2 = andn(c2, andn(c0, c1)); Iu32vec16 w3 = andn(c2, c0 & c1); Iu32vec16 w4 = andn(c0, andn(c1, c2)); Iu32vec16 w5 = andn(c1, c0 & c2); Iu32vec16 w6 = andn(c0, c1 & c2); // relation to field Iu32vec16 e[7]; Iu32vec16 f = current ^ _mm512_load_epi32(field + i); #pragma unroll(7) for(int j = 0; j < 7; 
j++) { __mmask16 ep = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*j])); __mmask16 em = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*j+1])); e[6-j] = _mm512_mask_mov_epi32(_mm512_mask_mov_epi32(zero, em, f), ep, one); } // check for spin flip Iu32vec16 flip = e[0] | e[1] & w1 | e[2] & w2 | e[3] & w3 | e[4] & w4 | e[5] & w5 | e[6] & w6; _mm512_store_epi32(updated + i, flip ^ current); #endif }