void auryn_vector_float_clip( auryn_vector_float * v, const float a, const float b ) { #ifdef CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY #ifdef CODE_ACTIVATE_CILK_INSTRUCTIONS for ( NeuronID i = 0 ; i < v->size ; ++i ) { if ( v->data[i] < a ) { v->data[i] = a; } else if ( v->data[i] > b ) v->data[i] = b; } #else const __m128 lo = _mm_set1_ps(a); const __m128 hi = _mm_set1_ps(b); for ( float * i = v->data ; i != v->data+v->size ; i += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS ) { __m128 chunk = sse_load( i ); __m128 result = _mm_min_ps(chunk, hi); result = _mm_max_ps(result, lo); sse_store( i, result ); } #endif /* CODE_ACTIVATE_CILK_INSTRUCTIONS */ #else for ( NeuronID i = 0 ; i < v->size ; ++i ) { if ( v->data[i] < a ) { v->data[i] = a; } else if ( v->data[i] > b ) v->data[i] = b; } #endif }
void auryn_vector_float_clip( auryn_vector_float * v, const float a ) { #ifdef CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY #ifdef CODE_ACTIVATE_CILK_INSTRUCTIONS auryn_vector_float_clip( v, a, 1e16 ); #else const __m128 lo = _mm_set1_ps(a); const __m128 hi = _mm_set1_ps(0.); for ( float * i = v->data ; i != v->data+v->size ; i += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS ) { __m128 chunk = sse_load( i ); __m128 result = _mm_min_ps(chunk, hi); result = _mm_max_ps(result, lo); sse_store( i, result ); } #endif /* CODE_ACTIVATE_CILK_INSTRUCTIONS */ #else auryn_vector_float_clip( v, a, 1e16 ); #endif }
/*
 * Load sixteen 16-byte rows from in8, optionally XOR some of them into a
 * running checksum, run the rows through the four-stage interleave network
 * (the matrix transpose), and store the result in out128.
 *
 * in8    : source bytes; 16 rows of 16 bytes are always loaded, whatever
 *          the value of len.
 * out128 : receives 16 vectors.
 * cksum  : if non-NULL, rows len-1 down to 0 are XORed into *cksum.
 * len    : number of leading rows folded into the checksum; values outside
 *          0..16 leave the checksum untouched.
 */
void read128_with_checksum(const uint8_t *in8, v16qi *out128, v16qi *cksum, int len)
{
  v16qi row[16];
  int r;

  /* Read data */
  for (r = 0; r < 16; r++)
    row[r] = sse_load(in8+16*r);

  /* Update checksum with the rows as loaded, before any interleaving */
  if (cksum && len >= 0 && len <= 16) {
    for (r = len - 1; r >= 0; r--)
      *cksum ^= row[r];
  }

  /* Transpose matrix: pair distances 8, 4, 2, 1. Within a stage the pairs
   * are disjoint; each pair (r, r+stride) is replaced by its low and high
   * interleave. */
  int stride;
  for (stride = 8; stride >= 1; stride /= 2) {
    int base;
    for (base = 0; base < 16; base += 2*stride) {
      for (r = base; r < base + stride; r++) {
        const v16qi first  = row[r];
        const v16qi second = row[r + stride];
        row[r]          = sse_interleavel(first, second);
        row[r + stride] = sse_interleaveh(first, second);
      }
    }
  }

  /* Write data: even-numbered rows fill out128[0..7], odd-numbered rows
   * fill out128[8..15]. */
  for (r = 0; r < 8; r++)
    out128[r] = row[2*r];
  for (r = 0; r < 8; r++)
    out128[8 + r] = row[2*r + 1];
}
int crypto_aead_decrypt(unsigned char *m,unsigned long long *outputmlen, unsigned char *nsec, const unsigned char *c,unsigned long long clen, const unsigned char *ad,unsigned long long adlen, const unsigned char *npub, const unsigned char *k) { v16qi data[16]; uint8_t buffer[16*16] = {0}; v16qi tweakey[16*TWEAKEY_SIZE]; *outputmlen = clen-CRYPTO_ABYTES; v16qi auth = CV(0); v16qi checksum = CV(0); // Associated Data if (adlen > 0) { size_t idx=0; tweakey_schedule(k, npub, TWEAK_AD, tweakey); for (idx=0; idx+256<adlen; idx+=16*16) { read128(ad+idx, data); encrypt_tweakey(data, tweakey); tweakey_increment(tweakey, idx); write128_checksum(data, NULL, &auth, 16); } // Final chunk uint8_t buffer2[16*16] = {0}; memcpy(buffer2, ad+idx, adlen-idx); if ((adlen % 16) == 0) { tweakey_set(tweakey, (adlen-idx-1)/16, 12, TWEAK_AD_LAST_FULL); } else { tweakey_set(tweakey, (adlen-idx-1)/16, 12, TWEAK_AD_LAST_PARTIAL); buffer2[adlen-idx] = 0x80; } tweakey_set(tweakey, (adlen-idx-1)/16, 13, 0); tweakey_set(tweakey, (adlen-idx-1)/16, 14, 0); tweakey_set(tweakey, (adlen-idx-1)/16, 15, 0); read128(buffer2, data); encrypt_tweakey(data, tweakey); write128_checksum(data, NULL, &auth, (adlen-idx+15)/16); } auth ^= sse_load(c+*outputmlen); // Message size_t idx=0; tweakey_schedule(k, npub, TWEAK_MESSAGE, tweakey); for (idx=0; idx+256 < *outputmlen; idx+=256) { read128(c+idx, data); decrypt_tweakey(data, tweakey); tweakey_increment(tweakey, idx); write128_checksum(data, m+idx, &checksum, 16); } int l = *outputmlen%16? *outputmlen%16: *outputmlen? 
16: 0; int fullblocks = (*outputmlen-l-idx)/16; // Final block // use slot fullblocks (tweak will be used for tag generation) tweakey_set(tweakey, fullblocks, 13, 0); tweakey_set(tweakey, fullblocks, 14, 0); tweakey_set(tweakey, fullblocks, 15, 0); if (*outputmlen) { if (*outputmlen%16) { tweakey_set(tweakey, fullblocks, 12, TWEAK_MESSAGE_LAST_PARTIAL); } else { tweakey_set(tweakey, fullblocks, 12, TWEAK_MESSAGE_LAST_FULL); } uint8_t buffer2[16*16] = {0}; buffer2[16*fullblocks+15] = 8*l; read128(buffer2, data); encrypt_tweakey(data, tweakey); write128(data, buffer2); unsigned i; for (i=0; i<l; i++) m[*outputmlen-l+i] = c[*outputmlen-l+i] ^ buffer2[16*fullblocks+i]; update_checksum(m+*outputmlen-l, &checksum, l); } // Last chunk: remaining full blocks, and checksum memcpy(buffer, c+idx, 16*fullblocks); sse_store(buffer+16*fullblocks, auth); if (*outputmlen%16) { tweakey_set(tweakey, fullblocks, 12, TWEAK_TAG_LAST_PARTIAL); } else { tweakey_set(tweakey, fullblocks, 12, TWEAK_TAG_LAST_FULL); } read128(buffer, data); decrypt_tweakey(data, tweakey); write128_checksum2(data, buffer, &checksum, fullblocks+1, fullblocks); memcpy(m+idx, buffer, 16*fullblocks); // Verify tag if (memcmp(&checksum, buffer+16*fullblocks, 16) != 0) { memset(m, 0, *outputmlen); return -1; } return 0; }