Exemplo n.º 1
0
/*
 * Fill the 512-bit constant vectors used by the KNC SMS4 code and build the
 * 32-bit S-box table.
 *
 * Writes mask_ff00, mask_ffff, mask_ff0000, vindex_0s and vindex_4i, which are
 * not declared in this function (presumably file-scope variables), and then
 * calls sms4_init_sbox32().
 *
 * key: not used by this function.
 */
void sms4_knc_encrypt_init(sms4_key_t *key)
{
	/* Number of 32-bit lanes in one 512-bit vector. */
	enum { LANES = sizeof(__m512i) / sizeof(int32_t) };

	/*
	 * _mm512_load_epi32 is an aligned load, so the staging buffer must sit
	 * on a 64-byte boundary. It is typed as int32_t so that it is written
	 * and read through the same element type.
	 */
	int32_t lanes[LANES] __attribute__((aligned(64)));
	int i;

	(void)key;

	/* Every lane = 0x0000ff00. */
	for (i = 0; i < LANES; i++)
		lanes[i] = 0xff00;
	mask_ff00 = _mm512_load_epi32(lanes);

	/* Every lane = 0x0000ffff. */
	for (i = 0; i < LANES; i++)
		lanes[i] = 0xffff;
	mask_ffff = _mm512_load_epi32(lanes);

	/* Every lane = 0x00ff0000. */
	for (i = 0; i < LANES; i++)
		lanes[i] = 0xff0000;
	mask_ff0000 = _mm512_load_epi32(lanes);

	/* Every lane = 0. */
	for (i = 0; i < LANES; i++)
		lanes[i] = 0;
	vindex_0s = _mm512_load_epi32(lanes);

	/* Lane i = 4 * i, i.e. 0, 4, 8, ..., 60. */
	for (i = 0; i < LANES; i++)
		lanes[i] = 4 * i;
	vindex_4i = _mm512_load_epi32(lanes);

	sms4_init_sbox32();
}
 inline
 void scatter(float *ptr, const int *offsets) const
 {
     __m512i indices;
     SHORTVEC_ASSERT_ALIGNED(offsets, 64);
     indices = _mm512_load_epi32(offsets);
     _mm512_i32scatter_ps(ptr, indices, val1, 4);
     indices = _mm512_load_epi32(offsets + 16);
     _mm512_i32scatter_ps(ptr, indices, val2, 4);
 }
 inline
 void gather(const float *ptr, const int *offsets)
 {
     __m512i indices;
     SHORTVEC_ASSERT_ALIGNED(offsets, 64);
     indices = _mm512_load_epi32(offsets);
     val1    = _mm512_i32gather_ps(indices, ptr, 4);
     indices = _mm512_load_epi32(offsets + 16);
     val2    = _mm512_i32gather_ps(indices, ptr, 4);
 }
Exemplo n.º 4
0
/* Print a label followed by eight doubles, in the row format used by main. */
static void print_lanes8(const char *label, const double *v)
{
   printf("%s: %f %f %f %f %f %f %f %f  \n",
          label, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

/*
 * Demo of 512-bit permutes applied to a vector of eight doubles.
 *
 * d1 holds 0.0 .. 7.0 and is loaded into t0. Each experiment permutes t0,
 * stores the result into d2, and prints d1 (input) and d2 (result).
 *
 * Returns 0.
 */
int main()
{
   __m512d  t0,t1;

   /* 64-byte alignment is required by the aligned load/store intrinsics. */
   double d1[8] __attribute__ ((aligned(64)));
   double d2[8] __attribute__ ((aligned(64)));

   for(int i=0; i<8; i++)
   {
       d1[i]= i*1.0;
       d2[i]= 0.0;
   }

   t0 = _mm512_load_pd(d1);

   /*
    * Immediate 0b00111001 = 2-bit fields (low to high) 1,2,3,0.
    * NOTE(review): presumably each field selects the source 128-bit lane
    * (one pair of doubles) for the matching destination lane -- confirm
    * against the intrinsic reference.
    */
   printf("permute backward\n");
   t1 = (__m512d)  _mm512_permute4f128_epi32 ( (__m512i) t0, 0b00111001);
   _mm512_store_pd(d2,t1);
   print_lanes8("d1", d1);
   print_lanes8("d2", d2);

   /* Immediate 0b10010011 = 2-bit fields (low to high) 3,0,1,2. */
   printf("permute forward\n");
   t1 = (__m512d)  _mm512_permute4f128_epi32 ( (__m512i) t0, 0b10010011);
   _mm512_store_pd(d2,t1);
   print_lanes8("d1", d1);
   print_lanes8("d2", d2);

   /*
    * Index vector over sixteen 32-bit elements; one double spans two of them.
    * Entries 2 and 3 are 0 and 1, every other entry is its own position.
    * NOTE(review): presumably this copies double 0 into double 1 and leaves
    * the rest unchanged -- confirm the operand order of the intrinsic.
    */
   int __attribute__((aligned(64))) order[16]={0,1,0,1,4,5,6,7,8,9,10,11,12,13,14,15};

   __m512i morder = _mm512_load_epi32(order);

   printf("permuting doubles\n");
   t1 = (__m512d) _mm512_permutevar_epi32 (morder, (__m512i) t0);
   _mm512_store_pd(d2,t1);
   print_lanes8("d1", d1);
   print_lanes8("d2", d2);

   return 0;
}
/*
 * Move every 32-bit lane of a one position towards the higher index:
 * result lane k holds input lane k-1, result lane 0 is 0, and the input's
 * last lane is dropped. Done through a 64-byte aligned memory buffer.
 */
inline __m512i shiftRight(__m512i a){
	/* Number of 32-bit lanes in the 64-byte vector. */
	enum { LANES = 64 / sizeof(int32_t) };
	int32_t lanes[LANES] __attribute__((aligned(64)));
	int dst;

	/* Spill the register to memory. */
	_mm512_store_epi32(lanes, a);

	/* Walk from the top lane down so each source is read before it is overwritten. */
	for (dst = LANES - 1; dst >= 1; --dst) {
		lanes[dst] = lanes[dst - 1];
	}
	lanes[0] = 0;

	/* Reload the shifted lanes into a register. */
	return _mm512_load_epi32(lanes);
}
/*
 * Score one database sequence against the query profile using 512-bit
 * vectors of sixteen 32-bit lanes.
 * NOTE(review): the name and the vH/vE/vF/lazy-F structure suggest Farrar's
 * striped Smith-Waterman; the striped layout of `profile` is assumed, not
 * visible here.
 *
 * o                 : working buffers. columnaPrevia_Max and columnaActual_Max
 *                     are swapped after every database character; columna_Left
 *                     holds the E values. All are accessed with aligned
 *                     512-bit loads/stores, so they must be 64-byte aligned
 *                     and hold at least numSeg * 16 int32_t.
 * sec_database      : database sequence; each byte indexes `letras`.
 * long_sec_database : number of characters of sec_database to process.
 *
 * Also reads names not declared in this function: long_sec_ref, long_profile,
 * profile, letras, coste_open_gap, coste_extend_gap.
 *
 * Returns the largest H value seen in any lane, saturated to 32767; 0 when
 * the query or the database sequence is empty.
 */
inline static int16_t smith_waterman_farrar(Objeto*o, char *sec_database, int16_t long_sec_database){
	/* Number of 32-bit lanes in one 512-bit vector. */
	enum { LANES = 64 / sizeof(int32_t) };

	int32_t * aux_Max;
	int16_t ret_max = 0;

	__m512i vGapOpen, vGapExtend, zero;
	__m512i vF, vH, vMax, vE_j, vAux0;
	const int segLen = LANES;
	/* Number of vectors needed to cover the query: ceil(long_sec_ref / segLen). */
	const int numSeg = (long_sec_ref + segLen - 1) / segLen;

	/* Broadcast the gap costs and a true zero through aligned buffers. */
	int32_t cog[LANES] __attribute__((aligned(64)));
	int32_t ceg[LANES] __attribute__((aligned(64)));
	int32_t ceros[LANES] __attribute__((aligned(64)));
	for(int x=0;x<segLen;x++) {
		cog[x] = coste_open_gap;
		ceg[x] = coste_extend_gap;
		ceros[x] = 0;
	}
	vGapOpen = _mm512_load_epi32(cog);
	vGapExtend = _mm512_load_epi32(ceg);
	/*
	 * Load a real zero vector instead of XOR-ing an uninitialized register
	 * with itself, which reads an indeterminate value.
	 */
	zero = _mm512_load_epi32(ceros);

	vMax = zero; // vMax = <0, 0, ..., 0>

	/* Reset the previous-H column and the E column. */
	for(int j=0; j<long_profile; j++){
		o->columnaPrevia_Max[j] = 0;
		o->columna_Left[j] = 0;
	}

	/*
	 * With no query segments the loads below would address
	 * columnaPrevia_Max - segLen; an empty query scores 0.
	 */
	if (numSeg <= 0) {
		return 0;
	}

	for(int x=0; x<long_sec_database; x++){
		// vF = <0, 0, ..., 0>
		vF = zero;

		// vH = last segment of the previous column, lanes moved up by one
		vH = _mm512_load_epi32(o->columnaPrevia_Max + (numSeg - 1) * segLen);
		vH = shiftRight(vH);

		/*
		 * Go through unsigned char so bytes >= 0x80 cannot produce a
		 * negative table index when char is signed.
		 */
		int8_t pos_letra_database = letras[(unsigned char)(sec_database[x])];
		int32_t offset = pos_letra_database * long_profile;
		int j;
		for(j=0; j<numSeg; j++){
			// vH = vH + vProfile[letter][j]
			int32_t * valor_match = profile + offset;
			offset += segLen;
			vAux0 = _mm512_load_epi32(valor_match);
			vH = _mm512_add_epi32(vH, vAux0);

			// vMax = max(vMax, vH)
			vMax = _mm512_max_epi32(vMax, vH);

			// vH = max(vH, vE[j], vF)
			vE_j = _mm512_load_epi32(o->columna_Left + j*segLen);
			vH = _mm512_max_epi32(vH, vE_j);
			vH = _mm512_max_epi32(vH, vF);

			// vHStore[j] = vH
			_mm512_store_epi32(o->columnaActual_Max + j*segLen, vH);

			// vAux = max(vH - vGapOpen, 0)
			vAux0 = _mm512_sub_epi32(vH, vGapOpen);
			vAux0 = _mm512_max_epi32(vAux0, zero);
			// vE[j] = max(vE[j] - vGapExtend, 0)
			vE_j = _mm512_sub_epi32(vE_j, vGapExtend);
			vE_j = _mm512_max_epi32(vE_j, zero);
			// vE[j] = max(vE[j], vAux)
			vE_j = _mm512_max_epi32(vE_j, vAux0);
			_mm512_store_epi32(o->columna_Left + j*segLen, vE_j);
			// vF = max(vF - vGapExtend, 0)
			vF = _mm512_sub_epi32(vF, vGapExtend);
			vF = _mm512_max_epi32(vF, zero);
			// vF = max(vF, vAux)
			vF = _mm512_max_epi32(vF, vAux0);

			// vH = vHLoad[j]
			vH = _mm512_load_epi32(o->columnaPrevia_Max + j*segLen);
		}

		// SWAT optimization: correct the stored H values while vF can still raise them.
		// Move the vF lanes up by one before re-scanning from segment 0.
		vF = shiftRight(vF);

		j = 0;
		do { // while any lane has vF > max(vHStore[j] - vGapOpen, 0)
			vH = _mm512_load_epi32(o->columnaActual_Max + j*segLen);
			vAux0 = _mm512_sub_epi32(vH, vGapOpen);
			vAux0 = _mm512_max_epi32(vAux0, zero);
			__mmask16 mascara = _mm512_cmpgt_epi32_mask (vF, vAux0);
			if (mascara == 0) break;
			// vHStore[j] = max(vHStore[j], vF)
			vH = _mm512_max_epi32(vH, vF);
			_mm512_store_epi32(o->columnaActual_Max + j*segLen, vH);

			// vF = max(vF - vGapExtend, 0)
			vF = _mm512_sub_epi32(vF, vGapExtend);
			vF = _mm512_max_epi32(vF, zero);
			if (++j >= numSeg) {
				// Wrapped past the last segment: move the vF lanes up by one again.
				vF = shiftRight(vF);
				j = 0;
			}

		} while(1);

		// The column just computed becomes the previous column.
		aux_Max = o->columnaActual_Max;
		o->columnaActual_Max = o->columnaPrevia_Max;
		o->columnaPrevia_Max = aux_Max;
	}

	/* Horizontal maximum over the lanes of vMax, saturated to the int16_t range. */
	int32_t max[LANES] __attribute__((aligned(64)));
	_mm512_store_epi32(max, vMax);
	for(int x=1;x<segLen;x++) {
		if(max[0] < max[x]) max[0] = max[x];
	}
	if (max[0] > 32767) max[0] = 32767;
	ret_max = max[0];
	return ret_max;
}
Exemplo n.º 7
0
// Main spin update routine.
//
// Updates the sixteen 32-bit words at site (x, y, z) of `updated`. Each bit
// position is handled independently: the six neighbour words are read from
// `neighbours` with periodic wrap-around in N, compared against the current
// word, and the resulting per-bit count of antiparallel neighbours together
// with the field relation and the random numbers in rndInt selects which bits
// are toggled. The body is compiled only when __MIC__ is defined.
//
// NOTE(review): the counter comments below assume andn(a, b) computes
// (~a) & b -- confirm against the Iu32vec16 header.
__forceinline
void spinFlipCore(uint32_t* updated, const uint32_t* neighbours, const uint32_t* field,
                  unsigned int x, unsigned int y, unsigned int z, __m512i rndInt)
{
#ifdef __MIC__
  const __m512i allSet = _mm512_set1_epi32(0xFFFFFFFF);
  const __m512i noneSet = _mm512_setzero_epi32();

  // periodic neighbour coordinates; xNext16 is the address used by the
  // "hi" half of the load that starts at x + 1
  const unsigned int xPrev = (x+N-1)%N;
  const unsigned int yPrev = (y+N-1)%N;
  const unsigned int zPrev = (z+N-1)%N;
  const unsigned int xNext16 = (x+17)%N;
  const unsigned int yNext = (y+1)%N;
  const unsigned int zNext = (z+1)%N;

  // base pointers of the current z-plane and of the current row inside it
  const uint32_t* const plane = neighbours + z*N*N;
  const uint32_t* const row = plane + y*N;

  // neighbour words: -x, +x (built from unpack-lo/hi load pairs), -y, +y, -z, +z
  Iu32vec16 nb[6];
  nb[0] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), row + xPrev);
  nb[0] = _mm512_loadunpackhi_epi32(nb[0], row + x + 16 - 1);
  nb[1] = _mm512_loadunpacklo_epi32(_mm512_undefined_epi32(), row + x + 1);
  nb[1] = _mm512_loadunpackhi_epi32(nb[1], row + xNext16);
  nb[2] = _mm512_load_epi32(plane + yPrev*N + x);
  nb[3] = _mm512_load_epi32(plane + yNext*N + x);
  nb[4] = _mm512_load_epi32(neighbours + zPrev*N*N + y*N + x);
  nb[5] = _mm512_load_epi32(neighbours + zNext*N*N + y*N + x);

  // after the XOR a bit is set where the neighbour differs from the current spin
  const unsigned int site = z*N*N + y*N + x;
  Iu32vec16 current = _mm512_load_epi32(updated + site);
  #pragma unroll(6)
  for(int k = 0; k < 6; ++k)
    nb[k] = current ^ nb[k];

  // per-bit population count of the six words, kept as three bit planes
  // (bit2 bit1 bit0)
  Iu32vec16 bit0, bit1, bit2, carry;

  bit0 = nb[0] ^ nb[1];
  bit1 = nb[0] & nb[1];

  bit0 ^= nb[2];
  bit1 |= andn(bit0, nb[2]);

  bit0 ^= nb[3];
  carry = andn(bit0, nb[3]);
  bit1 ^= carry;
  bit2 = andn(bit1, carry);

  bit0 ^= nb[4];
  carry = andn(bit0, nb[4]);
  bit1 ^= carry;
  bit2 |= andn(bit1, carry);

  bit0 ^= nb[5];
  carry = andn(bit0, nb[5]);
  bit1 ^= carry;
  bit2 |= andn(bit1, carry);

  // one selector word per count value 1..6
  Iu32vec16 count1 = andn(bit2, andn(bit1, bit0));
  Iu32vec16 count2 = andn(bit2, andn(bit0, bit1));
  Iu32vec16 count3 = andn(bit2, bit0 & bit1);
  Iu32vec16 count4 = andn(bit0, andn(bit1, bit2));
  Iu32vec16 count5 = andn(bit1, bit0 & bit2);
  Iu32vec16 count6 = andn(bit0, bit1 & bit2);

  // relation to field: per lane, all bits if rndInt <= expBeta[2*k],
  // otherwise the field-mismatch bits if rndInt <= expBeta[2*k+1], otherwise none
  Iu32vec16 accept[7];
  Iu32vec16 fieldDiff = current ^ _mm512_load_epi32(field + site);
  #pragma unroll(7)
  for(int k = 0; k < 7; k++)
  {
    __mmask16 always = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*k]));
    __mmask16 ifDiff = _mm512_cmple_epu32_mask(rndInt, _mm512_set1_epi32(expBeta[2*k+1]));
    accept[6-k] = _mm512_mask_mov_epi32(_mm512_mask_mov_epi32(noneSet, ifDiff, fieldDiff), always, allSet);
  }

  // check for spin flip: accept[0] applies to every bit, accept[c] only
  // where the count equals c
  Iu32vec16 flip = accept[0] | (accept[1] & count1) | (accept[2] & count2) | (accept[3] & count3)
                 | (accept[4] & count4) | (accept[5] & count5) | (accept[6] & count6);
  _mm512_store_epi32(updated + site, flip ^ current);
#endif
}