Пример #1
0
/*! Clamps every element of v into the range bounded below by a and above by b.
 *
 * Elements smaller than a are set to a; otherwise elements larger than b are
 * set to b. The vector is modified in place.
 *
 * When CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY is defined and
 * CODE_ACTIVATE_CILK_INSTRUCTIONS is not, the work is done with SSE min/max
 * on groups of SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS floats. That loop stops
 * only when the cursor lands exactly on data+size, so it relies on size being
 * a multiple of SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS. Every other build
 * configuration uses the plain element-wise loop.
 *
 * \param v vector to clamp in place
 * \param a lower bound
 * \param b upper bound
 */
void auryn_vector_float_clip( auryn_vector_float * v, const float a, const float b ) {
#if defined(CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY) && !defined(CODE_ACTIVATE_CILK_INSTRUCTIONS)
	const __m128 lower = _mm_set1_ps(a);
	const __m128 upper = _mm_set1_ps(b);
	float * const end = v->data + v->size;
	for ( float * cursor = v->data ; cursor != end ; cursor += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS ) {
		// Cap at the upper bound first, then raise to the lower bound.
		__m128 capped = _mm_min_ps( sse_load( cursor ), upper );
		sse_store( cursor, _mm_max_ps( capped, lower ) );
	}
#else
	for ( NeuronID k = 0 ; k < v->size ; ++k ) {
		const float x = v->data[k];
		if ( x < a )
			v->data[k] = a;
		else if ( x > b )
			v->data[k] = b;
	}
#endif
}
Пример #2
0
/*! Clamps every element of v into the interval [a, 0], in place.
 *
 * The explicit SSE path has always used 0 as the upper bound, whereas the
 * Cilk and non-SIMD fallbacks used to forward an upper bound of 1e16 to the
 * three-argument overload. The result therefore depended on the build flags.
 * All configurations now use the same upper bound of 0 as the SSE path.
 *
 * The SSE loop stops only when the cursor lands exactly on data+size, so it
 * relies on size being a multiple of SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS.
 *
 * \param v vector to clamp in place
 * \param a lower bound (the upper bound is fixed at 0)
 */
void auryn_vector_float_clip( auryn_vector_float * v, const float a ) {
#if defined(CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY) && !defined(CODE_ACTIVATE_CILK_INSTRUCTIONS)
	const __m128 lo = _mm_set1_ps(a);
	const __m128 hi = _mm_set1_ps(0.f);
	for ( float * i = v->data ; i != v->data+v->size ; i += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS )
	{
		__m128 chunk = sse_load( i );
		__m128 result = _mm_min_ps(chunk, hi);
		result = _mm_max_ps(result, lo);
		sse_store( i, result );
	}
#else
	// Same bounds as the SSE path above, via the general [a, b] overload.
	auryn_vector_float_clip( v, a, 0.f );
#endif
}
Пример #3
0
/*
 * Rearranges the 16 vectors in in128 in place (the "transpose" below), then
 * optionally stores some of the resulting rows and optionally XORs some of
 * them into a running checksum.
 *
 *   in128  16 vectors; overwritten with the rearranged rows.
 *   out8   if non-NULL, rows 0..mlen-1 are stored at out8 + 16*row.
 *   cksum  if non-NULL, rows 0..cklen-1 are XORed into *cksum.
 *   mlen   number of rows to store; values outside 0..16 match no case
 *          label, so nothing is stored.
 *   cklen  number of rows to fold into the checksum; values outside 0..16
 *          match no case label, so the checksum is left unchanged.
 *
 * "Row r" always means X(r) as defined below, not in128[r].
 */
void write128_checksum2(v16qi *in128, uint8_t* out8, v16qi *cksum,
		       int mlen, int cklen) {

  // Logical row i lives at in128[i/2] when i is even and at in128[8 + i/2]
  // when i is odd. The argument is not parenthesized, so X() must only be
  // used with a simple operand (a literal or a macro parameter).
#define X(i) in128[8*(i%2)+(i/2)]
  
  // Transpose matrix
  // INTERLEAVE(i,j) replaces rows i and j with the low/high interleave of
  // the pair (sse_interleavel / sse_interleaveh are defined elsewhere;
  // presumably byte-wise unpack-low / unpack-high -- confirm).
#define INTERLEAVE(i,j)					     \
  do {							     \
    v16qi t1= X(i);					     \
    v16qi t2= X(j);					     \
    X(i) = sse_interleavel(t1, t2);                          \
    X(j) = sse_interleaveh(t1, t2);                          \
  } while(0)
  
  // Stage 1: pair rows that are 8 apart.
  INTERLEAVE( 0,  8);
  INTERLEAVE( 1,  9);
  INTERLEAVE( 2, 10);
  INTERLEAVE( 3, 11);
  INTERLEAVE( 4, 12);
  INTERLEAVE( 5, 13);
  INTERLEAVE( 6, 14);
  INTERLEAVE( 7, 15);

  // Stage 2: pair rows that are 4 apart.
  INTERLEAVE( 0,  4);
  INTERLEAVE( 1,  5);
  INTERLEAVE( 2,  6);
  INTERLEAVE( 3,  7);
  INTERLEAVE( 8, 12);
  INTERLEAVE( 9, 13);
  INTERLEAVE(10, 14);
  INTERLEAVE(11, 15);

  // Stage 3: pair rows that are 2 apart.
  INTERLEAVE( 0,  2);
  INTERLEAVE( 1,  3);
  INTERLEAVE( 4,  6);
  INTERLEAVE( 5,  7);
  INTERLEAVE( 8, 10);
  INTERLEAVE( 9, 11);
  INTERLEAVE(12, 14);
  INTERLEAVE(13, 15);

  // Stage 4: pair adjacent rows.
  INTERLEAVE( 0,  1);
  INTERLEAVE( 2,  3);
  INTERLEAVE( 4,  5);
  INTERLEAVE( 6,  7);
  INTERLEAVE( 8,  9);
  INTERLEAVE(10, 11);
  INTERLEAVE(12, 13);
  INTERLEAVE(14, 15);

  // Write data
  // Every case falls through on purpose: entering at "case mlen" stores
  // rows mlen-1 down to 0, i.e. exactly the first mlen rows.
  if (out8) {
    switch (mlen) {
    case 16:
    sse_store(out8+16*15, X(15));
    case 15:
    sse_store(out8+16*14, X(14));
    case 14:
    sse_store(out8+16*13, X(13));
    case 13:
    sse_store(out8+16*12, X(12));
    case 12:
    sse_store(out8+16*11, X(11));
    case 11:
    sse_store(out8+16*10, X(10));
    case 10:
    sse_store(out8+16*9 , X(9 ));
    case 9:
    sse_store(out8+16*8 , X(8 ));
    case 8:
    sse_store(out8+16*7 , X(7 ));
    case 7:
    sse_store(out8+16*6 , X(6 ));
    case 6:
    sse_store(out8+16*5 , X(5 ));
    case 5:
    sse_store(out8+16*4 , X(4 ));
    case 4:
    sse_store(out8+16*3 , X(3 ));
    case 3:
    sse_store(out8+16*2 , X(2 ));
    case 2:
    sse_store(out8+16*1 , X(1 ));
    case 1:
    sse_store(out8+16*0 , X(0 ));
    case 0:
      ;
    }
  }

  // Update checksum
  // Same intentional fallthrough as above: entering at "case cklen" XORs
  // rows cklen-1 down to 0 into *cksum.
  if (cksum) {
    switch (cklen) {
    case 16:
      *cksum ^= X(15);
    case 15:
      *cksum ^= X(14);
    case 14:
      *cksum ^= X(13);
    case 13:
      *cksum ^= X(12);
    case 12:
      *cksum ^= X(11);
    case 11:
      *cksum ^= X(10);
    case 10:
      *cksum ^= X(9 );
    case 9:
      *cksum ^= X(8 );
    case 8:
      *cksum ^= X(7 );
    case 7:
      *cksum ^= X(6 );
    case 6:
      *cksum ^= X(5 );
    case 5:
      *cksum ^= X(4 );
    case 4:
      *cksum ^= X(3 );
    case 3:
      *cksum ^= X(2 );
    case 2:
      *cksum ^= X(1 );
    case 1:
      *cksum ^= X(0 );
    case 0:
      ;
    }
  }
  // The helper macros are local to this function.
#undef X
#undef INTERLEAVE  
}
Пример #4
0
/*
 * Decrypts c into m and verifies the authentication tag.
 *
 * The plaintext length is clen - CRYPTO_ABYTES; the tag block is read from
 * c + *outputmlen. Associated data and message are processed in chunks of
 * 16 blocks of 16 bytes (256 bytes), with the last chunk of each handled
 * separately under dedicated tweak values.
 *
 *   m           receives the plaintext; zeroed again if verification fails
 *   outputmlen  receives clen - CRYPTO_ABYTES
 *   nsec        unused
 *   c, clen     ciphertext followed by the tag block
 *   ad, adlen   associated data (may be empty)
 *   npub, k     forwarded to tweakey_schedule() (presumably nonce and key)
 *
 * Returns 0 when the tag verifies. Returns -1 when it does not, or when
 * clen is smaller than CRYPTO_ABYTES; in the latter case nothing is read
 * from c and nothing is written to m or outputmlen.
 */
int crypto_aead_decrypt(unsigned char *m,unsigned long long *outputmlen,
                        unsigned char *nsec,
                        const unsigned char *c,unsigned long long clen,
                        const unsigned char *ad,unsigned long long adlen,
                        const unsigned char *npub,
                        const unsigned char *k) {
  (void)nsec;

  // clen - CRYPTO_ABYTES is unsigned arithmetic: without this check a short
  // input would wrap to a huge length and drive out-of-bounds accesses.
  if (clen < CRYPTO_ABYTES)
    return -1;

  v16qi data[16];
  uint8_t buffer[16*16] = {0};
  v16qi tweakey[16*TWEAKEY_SIZE];
  *outputmlen = clen-CRYPTO_ABYTES;

  v16qi auth = CV(0);
  v16qi checksum = CV(0);

  // Associated Data
  if (adlen > 0) {
    size_t idx=0;
    tweakey_schedule(k, npub, TWEAK_AD, tweakey);
    
    // Full 256-byte chunks; the strict '<' always leaves 1..256 bytes for
    // the final chunk below.
    for (idx=0; idx+256<adlen; idx+=16*16) {
      read128(ad+idx, data);
      encrypt_tweakey(data, tweakey);
      tweakey_increment(tweakey, idx);
      write128_checksum(data, NULL, &auth, 16);
    }

    // Final chunk
    uint8_t buffer2[16*16] = {0};
    memcpy(buffer2, ad+idx, adlen-idx);
    if ((adlen % 16) == 0) {
      tweakey_set(tweakey, (adlen-idx-1)/16, 12, TWEAK_AD_LAST_FULL);
    } else {
      tweakey_set(tweakey, (adlen-idx-1)/16, 12, TWEAK_AD_LAST_PARTIAL);
      // Pad the partial block: a 0x80 byte followed by the zeros already
      // present in buffer2.
      buffer2[adlen-idx] = 0x80;
    }
    tweakey_set(tweakey, (adlen-idx-1)/16, 13, 0);
    tweakey_set(tweakey, (adlen-idx-1)/16, 14, 0);
    tweakey_set(tweakey, (adlen-idx-1)/16, 15, 0);
    read128(buffer2, data);
    encrypt_tweakey(data, tweakey);
    write128_checksum(data, NULL, &auth, (adlen-idx+15)/16);
  }

  // Fold the received tag block into the AD authenticator.
  auth ^= sse_load(c+*outputmlen);

  // Message
  size_t idx=0;
  tweakey_schedule(k, npub, TWEAK_MESSAGE, tweakey);
  for (idx=0; idx+256 < *outputmlen; idx+=256) {
    read128(c+idx, data);
    decrypt_tweakey(data, tweakey);
    tweakey_increment(tweakey, idx);
    write128_checksum(data, m+idx, &checksum, 16);
  }

  // l: byte length of the last message block (1..16, or 0 for an empty
  // message). fullblocks: complete 16-byte blocks left before that block.
  int l = *outputmlen%16? *outputmlen%16: *outputmlen? 16: 0;
  int fullblocks = (*outputmlen-l-idx)/16;

  // Final block
  // use slot fullblocks (tweak will be used for tag generation)
  tweakey_set(tweakey, fullblocks, 13, 0);
  tweakey_set(tweakey, fullblocks, 14, 0);
  tweakey_set(tweakey, fullblocks, 15, 0);

  if (*outputmlen) {
    if (*outputmlen%16) {
      tweakey_set(tweakey, fullblocks, 12, TWEAK_MESSAGE_LAST_PARTIAL);
    } else {
      tweakey_set(tweakey, fullblocks, 12, TWEAK_MESSAGE_LAST_FULL);
    }
    
    // Encrypt a block that carries only the bit length of the last block;
    // the result is the keystream XORed onto the last l ciphertext bytes.
    uint8_t buffer2[16*16] = {0};
    buffer2[16*fullblocks+15] = 8*l;
    read128(buffer2, data);
    
    encrypt_tweakey(data, tweakey);
    write128(data, buffer2);
    for (int i=0; i<l; i++)
      m[*outputmlen-l+i] = c[*outputmlen-l+i] ^ buffer2[16*fullblocks+i];
    update_checksum(m+*outputmlen-l, &checksum, l);
  }

  // Last chunk: remaining full blocks, and checksum
  memcpy(buffer, c+idx, 16*fullblocks);
  sse_store(buffer+16*fullblocks, auth);
  if (*outputmlen%16) {
    tweakey_set(tweakey, fullblocks, 12, TWEAK_TAG_LAST_PARTIAL);
  } else {
    tweakey_set(tweakey, fullblocks, 12, TWEAK_TAG_LAST_FULL);
  }
  read128(buffer, data);
  decrypt_tweakey(data, tweakey);
  // Stores fullblocks+1 rows (plaintext blocks plus the decrypted tag block)
  // but folds only the fullblocks plaintext rows into the checksum.
  write128_checksum2(data, buffer, &checksum, fullblocks+1, fullblocks);
  memcpy(m+idx, buffer, 16*fullblocks);

  // Verify tag
  // Compare all 16 bytes without an early exit so the running time does not
  // reveal the position of the first mismatching byte.
  uint8_t sum_bytes[16];
  memcpy(sum_bytes, &checksum, sizeof sum_bytes);
  const uint8_t *tag_block = buffer+16*fullblocks;
  unsigned diff = 0;
  for (size_t j=0; j<sizeof sum_bytes; j++)
    diff |= (unsigned)(sum_bytes[j] ^ tag_block[j]);

  if (diff != 0) {
    // Do not hand unauthenticated plaintext back to the caller.
    memset(m, 0, *outputmlen);
    return -1;
  }

  return 0;
}