/**
 * calculate_timespace - calibrate how many ROUNDS() units fill a time span
 * @load:   time span to fill, in the unit returned by get_time() differences
 *          (the verbose message labels it "us")
 * @config: benchmark configuration; only the verbose flag is read here
 *
 * Times an initial run of GAUGECOUNT rounds, then refines the estimate four
 * times: each pass scales the previous round count by load / measured time,
 * runs that many rounds and measures again.
 *
 * Return: the round count used in the last refinement pass.
 */
unsigned int calculate_timespace(long load, struct config *config)
{
    int i;
    long long now, then;
    unsigned int estimated = GAUGECOUNT;
    unsigned int rounds = 0;
    unsigned int timed = 0;

    if (config->verbose) {
        printf("calibrating load of %lius, please wait...\n", load);
    }

    /* initial measurement with the default round count */
    now = get_time();
    ROUNDS(estimated);
    then = get_time();

    timed = (unsigned int)(then - now);

    /* approximate the wanted load time from the previous measurement */
    for (i = 0; i < 4; i++) {
        /*
         * A run can complete within a single clock tick (in particular
         * once the round count has dropped to 0), which would make the
         * division below trap. Treat such a run as one time unit.
         */
        if (timed == 0) {
            timed = 1;
        }

        rounds = (unsigned int)(load * estimated / timed);
        dprintf("calibrating with %u rounds\n", rounds);

        now = get_time();
        ROUNDS(rounds);
        then = get_time();

        timed = (unsigned int)(then - now);
        estimated = rounds;
    }

    if (config->verbose) {
        printf("calibration done\n");
    }

    return estimated;
}
/*
 * Run the Camellia decryption rounds over the four 32-bit words in io[],
 * modifying them in place.
 *
 * subkey: expanded key; it is not named directly in this body and is
 *         presumably read through SUBKEY_L()/SUBKEY_R() -- confirm against
 *         those macros.
 * io:     four words of state, updated in place.
 * i:      highest subkey index, used for the first whitening step. When it
 *         equals 32 an additional ROUNDS(24)/FLS(24) stage runs first.
 *
 * The function-local ROUNDS(n) macro applies six CAMELLIA_ROUNDSM steps using
 * subkey indices n+7 down to n+2; FLS(n) applies CAMELLIA_FLS with indices
 * n+1 and n+0. Both are undefined again before the function ends.
 */
static void camellia_do_decrypt(const u32 *subkey, u32 *io, unsigned i)
{
	u32 il, ir, t0, t1;            /* temporary variables passed to the round macros */

	/* pre whitening but absorb kw2 */
	io[0] ^= SUBKEY_L(i);
	io[1] ^= SUBKEY_R(i);

	/* main iteration: subkeys are consumed from high index to low */
#define ROUNDS(i) ({ \
	CAMELLIA_ROUNDSM(io[0], io[1], \
			 SUBKEY_L(i + 7), SUBKEY_R(i + 7), \
			 io[2], io[3], il, ir); \
	CAMELLIA_ROUNDSM(io[2], io[3], \
			 SUBKEY_L(i + 6), SUBKEY_R(i + 6), \
			 io[0], io[1], il, ir); \
	CAMELLIA_ROUNDSM(io[0], io[1], \
			 SUBKEY_L(i + 5), SUBKEY_R(i + 5), \
			 io[2], io[3], il, ir); \
	CAMELLIA_ROUNDSM(io[2], io[3], \
			 SUBKEY_L(i + 4), SUBKEY_R(i + 4), \
			 io[0], io[1], il, ir); \
	CAMELLIA_ROUNDSM(io[0], io[1], \
			 SUBKEY_L(i + 3), SUBKEY_R(i + 3), \
			 io[2], io[3], il, ir); \
	CAMELLIA_ROUNDSM(io[2], io[3], \
			 SUBKEY_L(i + 2), SUBKEY_R(i + 2), \
			 io[0], io[1], il, ir); \
})
#define FLS(i) ({ \
	CAMELLIA_FLS(io[0], io[1], io[2], io[3], \
		     SUBKEY_L(i + 1), SUBKEY_R(i + 1), \
		     SUBKEY_L(i + 0), SUBKEY_R(i + 0), \
		     t0, t1, il, ir); \
})

	/* extra stage, only when the schedule extends to index 32 */
	if (i == 32) {
		ROUNDS(24);
		FLS(24);
	}
	ROUNDS(16);
	FLS(16);
	ROUNDS(8);
	FLS(8);
	ROUNDS(0);	/* last stage has no FL layer after it */

#undef ROUNDS
#undef FLS

	/* post whitening but kw4 */
	io[2] ^= SUBKEY_L(0);
	io[3] ^= SUBKEY_R(0);
	/* NB: 0,1 should be swapped with 2,3 by caller! */
}
/*
 * Encrypt a single block in place.
 *
 * blk:    block to encrypt; overwritten with the result.
 * aesKey: key whose rd_key schedule is used; ROUNDS(aesKey) gives the
 *         number of rounds.
 *
 * The block is XORed with schedule entry 0, run through _mm_aesenc_si128
 * for entries 1 .. rounds-1, and finished with _mm_aesenclast_si128.
 */
void AES_ecb_encrypt(block *blk, AES_KEY *aesKey)
{
    const unsigned rnds = ROUNDS(aesKey);
    const block *sched = (const block *)aesKey->rd_key;
    block state = _mm_xor_si128(*blk, sched[0]);
    unsigned round = 1;

    while (round < rnds) {
        state = _mm_aesenc_si128(state, sched[round]);
        ++round;
    }

    /* round now indexes the final schedule entry */
    *blk = _mm_aesenclast_si128(state, sched[round]);
}
/*
 * Encrypt one block from `in` into `out`.
 *
 * in:     source block, read with an aligned 128-bit load.
 * out:    destination block, written with an aligned 128-bit store.
 * aesKey: key whose rd_key schedule is used; ROUNDS(aesKey) gives the
 *         number of rounds.
 */
void AES_encryptC(block *in, block *out, AES_KEY *aesKey)
{
    const int rnds = ROUNDS(aesKey);
    const __m128i *rk = (const __m128i *)aesKey->rd_key;
    __m128i state = _mm_load_si128((__m128i *)in);
    int left;

    /* initial whitening with schedule entry 0 */
    state = _mm_xor_si128(state, *rk++);

    /* rnds - 1 full rounds, walking the schedule one entry at a time */
    for (left = rnds - 1; left > 0; --left) {
        state = _mm_aesenc_si128(state, *rk++);
    }

    /* rk now points at the final schedule entry */
    state = _mm_aesenclast_si128(state, *rk);
    _mm_store_si128((__m128i *)out, state);
}
/*
 * Encrypt data from `in` into `out` using 512-bit vector registers.
 *
 * key: source of the round keys (key->rk).
 * in:  input bytes, consumed by GET_BLKS().
 * out: output bytes, produced by PUT_BLKS().
 *
 * NOTE(review): the name says 16 blocks are handled per call; the exact
 * amount of data read and written is determined by GET_BLKS()/PUT_BLKS(),
 * which are defined elsewhere -- confirm there.
 */
void sms4_knc_encrypt_16blocks(sms4_key_t *key, const unsigned char *in, unsigned char *out)
{
	/*
	 * rk and t0..t4 are never named again in this body; presumably the
	 * ROUND/ROUNDS macros pick them up by name -- confirm before renaming
	 * or removing them.
	 */
	int *rk = (int *)key->rk;
	__m512i x0, x1, x2, x3, x4;
	__m512i t0, t1, t2, t3, t4;

	/* load the input into x0..x3 */
	GET_BLKS(x0, x1, x2, x3, in);
	/* run the cipher rounds, with ROUND supplied as the per-round macro */
	ROUNDS(x0, x1, x2, x3, x4, ROUND);
	/* the result is written from x2, x3, x4, x0 in that order */
	PUT_BLKS(out, x2, x3, x4, x0);
}
/*
 * Encrypt `nblks` blocks in place, processing all blocks one round at a
 * time (round-major order) rather than finishing one block before the next.
 *
 * blks:   array of blocks, overwritten with the results.
 * nblks:  number of blocks in `blks`.
 * aesKey: key whose rd_key schedule is used; ROUNDS(aesKey) gives the
 *         number of rounds.
 */
void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey)
{
    const unsigned rnds = ROUNDS(aesKey);
    const block *sched = (const block *)aesKey->rd_key;
    unsigned round = 1;
    unsigned b;

    /* initial whitening with schedule entry 0 */
    for (b = 0; b < nblks; ++b) {
        blks[b] = _mm_xor_si128(blks[b], sched[0]);
    }

    /* full rounds */
    while (round < rnds) {
        for (b = 0; b < nblks; ++b) {
            blks[b] = _mm_aesenc_si128(blks[b], sched[round]);
        }
        ++round;
    }

    /* final round; round indexes the last schedule entry here */
    for (b = 0; b < nblks; ++b) {
        blks[b] = _mm_aesenclast_si128(blks[b], sched[round]);
    }
}
/*
 * Decrypt `nblks` blocks in place, applying each round to every block
 * before moving on to the next round.
 *
 * blks:  array of blocks, overwritten with the results.
 * nblks: number of blocks in `blks`.
 * key:   key whose rd_key schedule is used with the AES decrypt
 *        instructions; ROUNDS(key) gives the number of rounds.
 */
inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key)
{
    const unsigned rnds = ROUNDS(key);
    const __m128i *sched = (const __m128i *)key->rd_key;
    unsigned round = 1;
    unsigned b;

    /* initial whitening with schedule entry 0 */
    for (b = 0; b < nblks; ++b) {
        blks[b] = _mm_xor_si128(blks[b], sched[0]);
    }

    /* full rounds */
    while (round < rnds) {
        for (b = 0; b < nblks; ++b) {
            blks[b] = _mm_aesdec_si128(blks[b], sched[round]);
        }
        ++round;
    }

    /* final round; round indexes the last schedule entry here */
    for (b = 0; b < nblks; ++b) {
        blks[b] = _mm_aesdeclast_si128(blks[b], sched[round]);
    }
}
/*
 * Decrypt one 16-byte block from `in` into `out`.
 *
 * in:  16 input bytes; no particular alignment is required.
 * out: 16 output bytes; no particular alignment is required.
 * key: key whose rd_key schedule is used with the AES decrypt instructions
 *      (presumably one prepared for decryption, e.g. by
 *      AES_set_decrypt_key_fast -- confirm at the call sites);
 *      ROUNDS(key) gives the number of rounds.
 *
 * The byte-pointer signature promises nothing about alignment, so the
 * unaligned load/store intrinsics are used. They accept aligned addresses
 * as well, so existing callers are unaffected.
 */
inline void AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
{
    int j, rnds = ROUNDS(key);
    const __m128i *sched = (const __m128i *)(key->rd_key);
    __m128i tmp = _mm_loadu_si128((const __m128i *)in);

    /* initial whitening with schedule entry 0 */
    tmp = _mm_xor_si128(tmp, sched[0]);

    for (j = 1; j < rnds; j++) {
        tmp = _mm_aesdec_si128(tmp, sched[j]);
    }

    /* j indexes the final schedule entry once the loop has finished */
    tmp = _mm_aesdeclast_si128(tmp, sched[j]);
    _mm_storeu_si128((__m128i *)out, tmp);
}
/*
 * Build a decryption schedule in `dkey` from the encryption schedule in
 * `ekey`.
 *
 * The schedule is reversed: the first and last entries are copied
 * unchanged to the opposite ends, and every entry in between is passed
 * through _mm_aesimc_si128 on the way. When OCB_KEY_LEN is 0 the round
 * count is also stored in dkey->rounds.
 *
 * Assumes ROUNDS(ekey) is at least 1.
 */
inline void AES_set_decrypt_key_fast(AES_KEY *dkey, const AES_KEY *ekey)
{
    const int last = ROUNDS(ekey);
    int k;

#if (OCB_KEY_LEN == 0)
    dkey->rounds = last;
#endif

    /* first encryption entry becomes the last decryption entry */
    dkey->rd_key[last] = ekey->rd_key[0];

    /* interior entries: mirrored position, transformed by AESIMC */
    for (k = 1; k < last; k++) {
        dkey->rd_key[last - k] = _mm_aesimc_si128(ekey->rd_key[k]);
    }

    /* last encryption entry becomes the first decryption entry */
    dkey->rd_key[0] = ekey->rd_key[last];
}
/*
 * Encrypt `nblks` blocks from `in` into `out`.
 *
 * Blocks are handled in groups of eight: within a group each round is
 * applied to all eight blocks before the next round starts. Whatever is
 * left over (fewer than eight blocks) is then processed the same way,
 * round by round, across the remaining blocks.
 *
 * in:     source blocks.
 * out:    destination blocks; intermediate round results are kept here.
 * nblks:  number of blocks to encrypt.
 * aesKey: key whose rd_key schedule is used; ROUNDS(aesKey) gives the
 *         number of rounds.
 */
void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey)
{
    enum { LANES = 8 };

    const int groups = nblks / LANES;
    const int tail_begin = groups * LANES;
    const int tail_len = nblks - tail_begin;
    const int tail_end = tail_begin + tail_len;
    const unsigned rnds = ROUNDS(aesKey);
    const block *sched = (const block *)aesKey->rd_key;
    unsigned round;
    int g, lane, k;

    /* full groups of LANES blocks */
    for (g = 0; g < groups; g++) {
        const block *src = in + g * LANES;
        block *dst = out + g * LANES;

        for (lane = 0; lane < LANES; lane++) {
            dst[lane] = _mm_xor_si128(src[lane], sched[0]);
        }

        for (round = 1; round < rnds; ++round) {
            for (lane = 0; lane < LANES; lane++) {
                dst[lane] = _mm_aesenc_si128(dst[lane], sched[round]);
            }
        }

        /* round indexes the final schedule entry here */
        for (lane = 0; lane < LANES; lane++) {
            dst[lane] = _mm_aesenclast_si128(dst[lane], sched[round]);
        }
    }

    /* leftover blocks that did not fill a whole group */
    for (k = tail_begin; k < tail_end; ++k) {
        out[k] = _mm_xor_si128(in[k], sched[0]);
    }

    for (round = 1; round < rnds; ++round) {
        for (k = tail_begin; k < tail_end; ++k) {
            out[k] = _mm_aesenc_si128(out[k], sched[round]);
        }
    }

    for (k = tail_begin; k < tail_end; ++k) {
        out[k] = _mm_aesenclast_si128(out[k], sched[round]);
    }
}
/*
 * Encrypt exactly four blocks in place, applying each round to all four
 * blocks before the next round begins.
 *
 * blks:   array of four blocks, overwritten with the results.
 * aesKey: key whose rd_key schedule is used; ROUNDS(aesKey) gives the
 *         number of rounds.
 */
void AES_ecb_encrypt_blks_4(block *blks, AES_KEY *aesKey)
{
    enum { LANES = 4 };

    const unsigned rnds = ROUNDS(aesKey);
    const block *sched = (const block *)aesKey->rd_key;
    unsigned round;
    int lane;

    /* initial whitening with schedule entry 0 */
    for (lane = 0; lane < LANES; lane++) {
        blks[lane] = _mm_xor_si128(blks[lane], sched[0]);
    }

    for (round = 1; round < rnds; ++round) {
        for (lane = 0; lane < LANES; lane++) {
            blks[lane] = _mm_aesenc_si128(blks[lane], sched[round]);
        }
    }

    /* round indexes the final schedule entry here */
    for (lane = 0; lane < LANES; lane++) {
        blks[lane] = _mm_aesenclast_si128(blks[lane], sched[round]);
    }
}
/*
 * Encrypt one 16-byte block with SMS4.
 *
 * in:  16 input bytes, read as four 32-bit words via GET32().
 * out: 16 output bytes, written as four 32-bit words via PUT32().
 * key: round keys (key->rk); only read here.
 */
void sms4_encrypt(const unsigned char *in, unsigned char *out, const sms4_key_t *key)
{
    /*
     * key is const, so the round-key pointer must be const as well; the
     * unqualified pointer used before silently dropped that qualifier.
     * rk is not named again below; presumably ROUNDS() reads it by name.
     */
    const uint32_t *rk = key->rk;
    uint32_t x0, x1, x2, x3, x4;

    x0 = GET32(in     );
    x1 = GET32(in +  4);
    x2 = GET32(in +  8);
    x3 = GET32(in + 12);

    ROUNDS(x0, x1, x2, x3, x4);

    /* output words are taken from x0, x4, x3, x2 in that order */
    PUT32(x0, out     );
    PUT32(x4, out +  4);
    PUT32(x3, out +  8);
    PUT32(x2, out + 12);

    /*
     * Clear the working words.
     * NOTE(review): these are plain stores to locals that are not read
     * again, so an optimizing compiler may drop them.
     */
    x0 = x1 = x2 = x3 = x4 = 0;
}
/*
 * Encrypt exactly four blocks from `in` into `out`, applying each round to
 * all four blocks before the next round begins.
 *
 * in:     four source blocks.
 * out:    four destination blocks; intermediate round results are kept here.
 * aesKey: key whose rd_key schedule is used; ROUNDS(aesKey) gives the
 *         number of rounds.
 */
void AES_ecb_encrypt_blks_4_in_out(block *in, block *out, AES_KEY *aesKey)
{
    enum { LANES = 4 };

    const unsigned rnds = ROUNDS(aesKey);
    const block *sched = (const block *)aesKey->rd_key;
    unsigned round;
    int lane;

    /* initial whitening with schedule entry 0 */
    for (lane = 0; lane < LANES; lane++) {
        out[lane] = _mm_xor_si128(in[lane], sched[0]);
    }

    for (round = 1; round < rnds; ++round) {
        for (lane = 0; lane < LANES; lane++) {
            out[lane] = _mm_aesenc_si128(out[lane], sched[round]);
        }
    }

    /* round indexes the final schedule entry here */
    for (lane = 0; lane < LANES; lane++) {
        out[lane] = _mm_aesenclast_si128(out[lane], sched[round]);
    }
}
/**
 * start_benchmark - run the load/sleep benchmark for every configured round
 * @config: benchmark configuration (sleep/load times and their steps,
 *          number of rounds and cycles, cpu, governor, output stream,
 *          verbose flag)
 *
 * For each round the CPU is first switched to the "performance" governor,
 * the load is calibrated with calculate_timespace(), and config->cycles
 * sleep+load cycles are timed. The same cycles are then timed again under
 * config->governor. One line per round is written to config->output:
 * round index, load time, sleep time, average time per cycle under each
 * governor, and the performance/governor ratio in percent.
 *
 * Returns early, without finishing the current line, if a governor cannot
 * be set.
 */
void start_benchmark(struct config *config)
{
    unsigned int _round, cycle;
    long long now, then;
    long sleep_time = 0, load_time = 0;
    long performance_time = 0, powersave_time = 0;
    unsigned int calculations;
    unsigned long total_time = 0, progress_time = 0;

    sleep_time = config->sleep;
    load_time = config->load;

    /* total amount handed to show_progress() as the 100% mark */
    for (_round = 1; _round <= config->rounds; _round++) {
        total_time += _round * (config->sleep + config->load);
    }
    /* every round is measured twice: "performance" and config->governor */
    total_time *= 2;

    for (_round = 0; _round < config->rounds; _round++) {
        performance_time = 0LL;
        powersave_time = 0LL;

        show_progress(total_time, progress_time);

        /* first pass: measure under the performance governor */
        if (set_cpufreq_governor("performance", config->cpu) != 0) {
            return;
        }

        /* calibrate how many calculations fill load_time */
        calculations = calculate_timespace(load_time, config);

        if (config->verbose) {
            printf("_round %i: doing %u cycles with %u calculations"
                   " for %lius\n", _round + 1, config->cycles,
                   calculations, load_time);
        }

        fprintf(config->output, "%u %li %li ",
                _round, load_time, sleep_time);

        if (config->verbose) {
            /*
             * Guard both divisions: calibration may yield 0
             * calculations, and load_time may be configured as 0.
             * The rate is computed in long long because
             * 1000000 * calculations wraps in 32-bit unsigned
             * arithmetic once calculations exceeds 4294.
             */
            long average = calculations ?
                (long)(load_time / (long)calculations) : 0L;
            long rps = load_time ?
                (long)(1000000LL * calculations / load_time) : 0L;

            printf("avarage: %lius, rps:%li\n", average, rps);
        }

        for (cycle = 0; cycle < config->cycles; cycle++) {
            now = get_time();
            usleep(sleep_time);
            ROUNDS(calculations);
            then = get_time();
            /* only the load part counts, so subtract the sleep */
            performance_time += then - now - sleep_time;
            if (config->verbose) {
                printf("performance cycle took %lius, "
                       "sleep: %lius, "
                       "load: %lius, rounds: %u\n",
                       (long)(then - now), sleep_time,
                       load_time, calculations);
            }
        }
        /* with 0 cycles nothing was measured; report 0 instead of dividing by 0 */
        fprintf(config->output, "%li ",
                config->cycles ?
                (long)(performance_time / (long)config->cycles) : 0L);

        progress_time += sleep_time + load_time;
        show_progress(total_time, progress_time);

        /* second pass: same workload under the governor being tested */
        if (set_cpufreq_governor(config->governor, config->cpu) != 0) {
            return;
        }

        for (cycle = 0; cycle < config->cycles; cycle++) {
            now = get_time();
            usleep(sleep_time);
            ROUNDS(calculations);
            then = get_time();
            powersave_time += then - now - sleep_time;
            if (config->verbose) {
                printf("powersave cycle took %lius, "
                       "sleep: %lius, "
                       "load: %lius, rounds: %u\n",
                       (long)(then - now), sleep_time,
                       load_time, calculations);
            }
        }

        progress_time += sleep_time + load_time;

        fprintf(config->output, "%li ",
                config->cycles ?
                (long)(powersave_time / (long)config->cycles) : 0L);

        /* floating-point division: a zero denominator gives inf/nan, not a trap */
        fprintf(config->output, "%.3f\n",
                performance_time * 100.0 / powersave_time);
        fflush(config->output);

        if (config->verbose) {
            printf("performance is at %.2f%%\n",
                   performance_time * 100.0 / powersave_time);
        }

        /* next round uses longer sleep and load phases */
        sleep_time += config->sleep_step;
        load_time += config->load_step;
    }
}