Beispiel #1
0
/*! Clamps every element of v, in place, to the range [a:b].
 *
 * The implementation is chosen at compile time: an explicit SSE loop is used
 * when CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY is defined and
 * CODE_ACTIVATE_CILK_INSTRUCTIONS is not; every other configuration uses the
 * plain element-wise loop.
 *
 * \param v vector whose v->size entries in v->data are modified
 * \param a lower bound written to entries that compare below it
 * \param b upper bound written to entries that compare above it
 */
void auryn_vector_float_clip( auryn_vector_float * v, const float a, const float b ) {
#if defined(CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY) && !defined(CODE_ACTIVATE_CILK_INSTRUCTIONS)
	const __m128 upper = _mm_set1_ps(b);
	const __m128 lower = _mm_set1_ps(a);
	float * cursor = v->data;
	// The end test is an exact pointer match, so this loop relies on v->size
	// being a multiple of SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS.
	while ( cursor != v->data+v->size ) {
		// Cap from above first, then from below (same order as min/max below).
		const __m128 capped = _mm_min_ps( sse_load( cursor ), upper );
		sse_store( cursor, _mm_max_ps( capped, lower ) );
		cursor += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS;
	}
#else
	for ( NeuronID idx = 0 ; idx < v->size ; ++idx ) {
		if ( v->data[idx] < a ) {
			v->data[idx] = a;
			continue; // the upper bound is only checked when the lower one did not apply
		}
		if ( v->data[idx] > b ) {
			v->data[idx] = b;
		}
	}
#endif /* explicit SSE without Cilk */
}
Beispiel #2
0
/*! Single-bound variant of auryn_vector_float_clip; modifies v in place.
 *
 * With CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY defined and
 * CODE_ACTIVATE_CILK_INSTRUCTIONS undefined, elements are clamped to [a:0]
 * with SSE instructions. In every other configuration the call is forwarded
 * to the three-argument overload with an upper bound of 1e16.
 *
 * NOTE(review): the effective upper bound therefore differs between build
 * configurations (0 versus 1e16) -- confirm which one is intended.
 *
 * \param v vector whose v->size entries in v->data are modified
 * \param a lower bound
 */
void auryn_vector_float_clip( auryn_vector_float * v, const float a ) {
#if defined(CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY) && !defined(CODE_ACTIVATE_CILK_INSTRUCTIONS)
	const __m128 upper = _mm_set1_ps(0.);
	const __m128 lower = _mm_set1_ps(a);
	float * cursor = v->data;
	// The end test is an exact pointer match, so this loop relies on v->size
	// being a multiple of SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS.
	while ( cursor != v->data+v->size ) {
		const __m128 capped = _mm_min_ps( sse_load( cursor ), upper );
		sse_store( cursor, _mm_max_ps( capped, lower ) );
		cursor += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS;
	}
#else
	auryn_vector_float_clip( v, a, 1e16 );
#endif /* explicit SSE without Cilk */
}
Beispiel #3
0
/*
 * Loads sixteen 16-byte rows from in8, optionally XORs the leading rows into
 * a running checksum, rearranges the rows with four rounds of pairwise
 * interleaving (the matrix transpose), and stores the result to out128.
 *
 *   in8     source bytes; rows are read at in8 + 16*r for r = 0..15
 *   out128  receives the sixteen resulting vectors
 *   cksum   when non-NULL, rows 0..len-1 are XORed into *cksum
 *   len     number of leading rows to checksum; a value outside 0..16
 *           leaves *cksum untouched
 */
void read128_with_checksum(const uint8_t *in8, v16qi *out128, v16qi *cksum, int len) {
  v16qi row[16];
  int r;
  int stride;

  // Read data
  for (r = 0; r < 16; r++)
    row[r] = sse_load(in8+16*r);

  // Update checksum: fold rows len-1 down to 0 into the accumulator
  if (cksum && len >= 0 && len <= 16) {
    for (r = len-1; r >= 0; r--)
      *cksum ^= row[r];
  }

  // Transpose matrix: interleave rows (r, r+stride) for stride = 8, 4, 2, 1
  for (stride = 8; stride > 0; stride >>= 1) {
    for (r = 0; r < 16; r++) {
      v16qi first;
      v16qi second;

      // Rows with the stride bit set are the upper member of a pair that
      // was already processed at index r-stride.
      if (r & stride)
        continue;

      first  = row[r];
      second = row[r+stride];
      row[r]        = sse_interleavel(first, second);
      row[r+stride] = sse_interleaveh(first, second);
    }
  }

  // Write data: even-numbered rows fill out128[0..7], odd-numbered rows
  // fill out128[8..15]
  for (r = 0; r < 8; r++)
    out128[r] = row[2*r];
  for (r = 0; r < 8; r++)
    out128[8+r] = row[2*r+1];
}
Beispiel #4
0
/*
 * Decrypts and verifies an authenticated ciphertext.
 *
 *   m           receives the plaintext (*outputmlen bytes); it is zeroed
 *               when verification fails
 *   outputmlen  set to clen - CRYPTO_ABYTES, or to 0 when clen is too short
 *   nsec        unused
 *   c, clen     ciphertext; the 16 bytes at c + (clen - CRYPTO_ABYTES) are
 *               loaded as the tag
 *   ad, adlen   associated data folded into the authentication value
 *   npub        public nonce handed to tweakey_schedule
 *   k           key handed to tweakey_schedule
 *
 * Returns 0 when the recomputed checksum matches the decrypted tag block,
 * and -1 otherwise (including when clen < CRYPTO_ABYTES).
 *
 * Data is processed in chunks of 16 blocks of 16 bytes (256 bytes); the last
 * chunk of each phase is staged through a zero-padded stack buffer.
 */
int crypto_aead_decrypt(unsigned char *m,unsigned long long *outputmlen,
                        unsigned char *nsec,
                        const unsigned char *c,unsigned long long clen,
                        const unsigned char *ad,unsigned long long adlen,
                        const unsigned char *npub,
                        const unsigned char *k) {
  v16qi data[16];
  uint8_t buffer[16*16] = {0};
  v16qi tweakey[16*TWEAKEY_SIZE];

  (void)nsec;

  // A ciphertext shorter than the tag cannot be valid. Without this check the
  // unsigned subtraction below wraps around and every later offset computed
  // from *outputmlen points far outside c and m.
  if (clen < CRYPTO_ABYTES) {
    *outputmlen = 0;
    return -1;
  }
  *outputmlen = clen-CRYPTO_ABYTES;

  v16qi auth = CV(0);
  v16qi checksum = CV(0);

  // Associated Data
  if (adlen > 0) {
    size_t idx=0;
    tweakey_schedule(k, npub, TWEAK_AD, tweakey);
    
    // Full 256-byte chunks; at least one byte is always left for the final chunk
    for (idx=0; idx+256<adlen; idx+=16*16) {
      read128(ad+idx, data);
      encrypt_tweakey(data, tweakey);
      tweakey_increment(tweakey, idx);
      write128_checksum(data, NULL, &auth, 16);
    }

    // Final chunk
    uint8_t buffer2[16*16] = {0};
    memcpy(buffer2, ad+idx, adlen-idx);
    if ((adlen % 16) == 0) {
      tweakey_set(tweakey, (adlen-idx-1)/16, 12, TWEAK_AD_LAST_FULL);
    } else {
      tweakey_set(tweakey, (adlen-idx-1)/16, 12, TWEAK_AD_LAST_PARTIAL);
      // Padding marker directly after the last associated-data byte
      buffer2[adlen-idx] = 0x80;
    }
    tweakey_set(tweakey, (adlen-idx-1)/16, 13, 0);
    tweakey_set(tweakey, (adlen-idx-1)/16, 14, 0);
    tweakey_set(tweakey, (adlen-idx-1)/16, 15, 0);
    read128(buffer2, data);
    encrypt_tweakey(data, tweakey);
    write128_checksum(data, NULL, &auth, (adlen-idx+15)/16);
  }

  // Fold the received tag (stored right after the ciphertext body) into auth
  auth ^= sse_load(c+*outputmlen);

  // Message
  size_t idx=0;
  tweakey_schedule(k, npub, TWEAK_MESSAGE, tweakey);
  for (idx=0; idx+256 < *outputmlen; idx+=256) {
    read128(c+idx, data);
    decrypt_tweakey(data, tweakey);
    tweakey_increment(tweakey, idx);
    write128_checksum(data, m+idx, &checksum, 16);
  }

  // l: byte length of the last block (1..16, or 0 for an empty message);
  // fullblocks: number of complete blocks between idx and that last block
  int l = *outputmlen%16? *outputmlen%16: *outputmlen? 16: 0;
  int fullblocks = (*outputmlen-l-idx)/16;

  // Final block
  // use slot fullblocks (tweak will be used for tag generation)
  tweakey_set(tweakey, fullblocks, 13, 0);
  tweakey_set(tweakey, fullblocks, 14, 0);
  tweakey_set(tweakey, fullblocks, 15, 0);

  if (*outputmlen) {
    if (*outputmlen%16) {
      tweakey_set(tweakey, fullblocks, 12, TWEAK_MESSAGE_LAST_PARTIAL);
    } else {
      tweakey_set(tweakey, fullblocks, 12, TWEAK_MESSAGE_LAST_FULL);
    }
    
    // Encrypt a block that holds only the bit length of the last block and
    // XOR the result onto the ciphertext to recover the last plaintext bytes
    uint8_t buffer2[16*16] = {0};
    buffer2[16*fullblocks+15] = 8*l;
    read128(buffer2, data);
    
    encrypt_tweakey(data, tweakey);
    write128(data, buffer2);
    int i;
    for (i=0; i<l; i++)
      m[*outputmlen-l+i] = c[*outputmlen-l+i] ^ buffer2[16*fullblocks+i];
    update_checksum(m+*outputmlen-l, &checksum, l);
  }

  // Last chunk: remaining full blocks, and checksum
  memcpy(buffer, c+idx, 16*fullblocks);
  sse_store(buffer+16*fullblocks, auth);
  if (*outputmlen%16) {
    tweakey_set(tweakey, fullblocks, 12, TWEAK_TAG_LAST_PARTIAL);
  } else {
    tweakey_set(tweakey, fullblocks, 12, TWEAK_TAG_LAST_FULL);
  }
  read128(buffer, data);
  decrypt_tweakey(data, tweakey);
  write128_checksum2(data, buffer, &checksum, fullblocks+1, fullblocks);
  memcpy(m+idx, buffer, 16*fullblocks);

  // Verify tag. All 16 bytes are always examined so that the comparison does
  // not stop at the first mismatching byte the way memcmp may.
  const unsigned char *computed = (const unsigned char *)&checksum;
  const unsigned char *received = buffer+16*fullblocks;
  unsigned char diff = 0;
  int j;
  for (j=0; j<16; j++)
    diff |= (unsigned char)(computed[j] ^ received[j]);

  if (diff != 0) {
    // Do not release unauthenticated plaintext
    memset(m, 0, *outputmlen);
    return -1;
  }

  return 0;
}