// Emits the channel-dimension loop shared by the forward kernels.
// @param body emitter invoked with the number of full 16-channel blocks to
//             process at the current offsets
// @param tail emitter for the final partial (< 16 channel) block
// Runtime offsets maintained for the emitted code: reg_coff_s8 (byte offset
// into the s8 data, seeded from reg_soff) and reg_coff_f32 (byte offset into
// the float scale/shift/stats arrays).
void channel_loop(body_t body, tail_t tail) {
    size_t num_loops = num_c16_blocks_ / unroll_regs_;
    // Full 16-channel blocks left over after the unrolled loop.
    size_t loop_tail = num_c16_blocks_ - num_loops * unroll_regs_;

    mov(reg_coff_s8, reg_soff);
    xor_(reg_coff_f32, reg_coff_f32);

    if (num_loops) {
        // reg_tmp holds the s8 byte count that will have been consumed after
        // the upcoming iteration; loop while it stays <= reg_coff_max.
        xor_(reg_tmp, reg_tmp);
        add(reg_tmp, c_in_xmm_ * unroll_regs_);

        Label c_loop;
        L(c_loop); {
            body(unroll_regs_);

            // Advance both data and coefficient offsets past the blocks
            // just emitted.
            add(reg_coff_s8, c_in_xmm_ * unroll_regs_);
            add(reg_coff_f32, sizeof(float) * c_in_xmm_ * unroll_regs_);
            add(reg_tmp, c_in_xmm_ * unroll_regs_);

            cmp(reg_tmp, reg_coff_max);
            jle(c_loop);
        }
    }

    if (loop_tail)
        body(loop_tail);

    if (c_tail_) {
        // Step past the blocks body(loop_tail) just handled, then emit the
        // sub-16-channel tail.
        add(reg_coff_s8, c_in_xmm_ * loop_tail);
        add(reg_coff_f32, sizeof(float) * c_in_xmm_ * loop_tail);
        tail();
    }
}
// Expand the 16-unsigned char key to 11 round keys (176 bytes) // http://en.wikipedia.org/wiki/Rijndael_key_schedule#The_key_schedule static void expand_key(unsigned char *key, unsigned char *keys) { int bytes=16; // The count of how many bytes we've created so far int i=1; // The rcon iteration value i is set to 1 int j; // For repeating the second stage 3 times unsigned char t[4]; // Temporary working area known as 't' in the Wiki article memcpy(keys,key,16); // The first 16 bytes of the expanded key are simply the encryption key while (bytes<176) { // Until we have 176 bytes of expanded key, we do the following: memcpy(t,keys+bytes-4,4); // We assign the value of the previous four bytes in the expanded key to t key_schedule_core(t, i); // We perform the key schedule core on t, with i as the rcon iteration value i++; // We increment i by 1 xor_(t,keys+bytes-16,4); // We exclusive-or t with the four-unsigned char block 16 bytes before the new expanded key. memcpy(keys+bytes,t,4); // This becomes the next 4 bytes in the expanded key bytes+=4; // Keep track of how many expanded key bytes we've added // We then do the following three times to create the next twelve bytes for (j=0;j<3;j++) { memcpy(t,keys+bytes-4,4); // We assign the value of the previous 4 bytes in the expanded key to t xor_(t,keys+bytes-16,4); // We exclusive-or t with the four-unsigned char block n bytes before memcpy(keys+bytes,t,4); // This becomes the next 4 bytes in the expanded key bytes+=4; // Keep track of how many expanded key bytes we've added } } }
// JIT-emits the forward kernel loop for AVX2: reg_soff walks the outer
// (minibatch/spatial) dimension; per-channel work is delegated to
// channel_loop with a 16-channel-block body and a sub-16-channel tail.
void forward_avx2() {
    xor_(reg_soff, reg_soff);
    Label mb_sp_loop;
    L(mb_sp_loop); {
        channel_loop(
                [=](size_t unroll) {
                    // Load 32 channels (two C16_blocks) in ymm, then
                    // split the work in half, each half splits in two
                    // regs with 8 channels per. When down converting,
                    // put the result in a temp register for the 1st
                    // iteration, combine the result at 2nd iteration
                    // and store ymm with 32 channels.
                    // If 16 channels, do just one half and store the
                    // result with mask.
                    Vmm v0 = Vmm(0);
                    Vmm v1 = Vmm(1);
                    Vmm vscale0 = Vmm(2);
                    Vmm vshift0 = Vmm(3);
                    Vmm vmean0 = Vmm(4);
                    Vmm vsqrtvar0 = Vmm(5);
                    Vmm vscale1 = Vmm(6);
                    Vmm vshift1 = Vmm(7);
                    Vmm vmean1 = Vmm(8);
                    Vmm vsqrtvar1 = Vmm(9);
                    Vmm tmp = Vmm(10);

                    for (size_t i = 0; i < unroll; i++) {
                        // Coefficients for the low and high 8-channel
                        // halves of this 16-channel block.
                        compute_vscaleshift(vscale0, vshift0, vmean0,
                                vsqrtvar0, i * c_in_xmm_ * sizeof(float));
                        compute_vscaleshift(vscale1, vshift1, vmean1,
                                vsqrtvar1, i * c_in_xmm_ * sizeof(float)
                                        + simd_w_ * sizeof(float));

                        // Up-convert s8 -> s32 -> f32.
                        vpmovsxbd(v0, src_ptr(i*c_in_xmm_));
                        vpmovsxbd(v1, src_ptr(i*c_in_xmm_ + simd_w_));
                        vcvtdq2ps(v0, v0);
                        vcvtdq2ps(v1, v1);

                        uni_vfmadd213ps(v0, vscale0, vshift0);
                        uni_vfmadd213ps(v1, vscale1, vshift1);
                        if (with_relu_) {
                            uni_vmaxps(v0, v0, vzero);
                            uni_vmaxps(v1, v1, vzero);
                        }

                        // Down-convert f32 -> s32 -> s8 with saturation;
                        // the permutes undo the 128-bit lane interleaving
                        // introduced by the pack instructions.
                        vcvtps2dq(v0, v0); // BA
                        vcvtps2dq(v1, v1); // DC
                        vpackssdw(v0, v0, v1); // BA + DC -> DBCA
                        vpermq(v0, v0, 0xD8); // DBCA -> DCBA
                        vperm2i128(v1, v0, v0, 0x1); // DCBA -> BADC
                        vpacksswb(v0, v0, v1); // DCBA + BADC -> badcDCBA

                        if (i == 0 && unroll != 1)
                            uni_vmovups(tmp, v0);
                        else if (i == 1) {
                            // badcDCBA + fehgHGFE -> HGFEDCBA
                            vperm2i128(v0, v0, tmp, 0x2);
                        }
                    }

                    if (unroll == 1)
                        vmaskmovps(dst_ptr(), vbody_mask, v0);
                    else
                        uni_vmovups(dst_ptr(), v0);
                },
                [=]() {
                    // handle first 8 channels. If tail is bigger,
                    // handle second part separately. There is no way
                    // to get performance as one has to work with bytes
                    // via xmm. vzeroupper kills all the perf.
                    Xmm x0 = Xmm(0);
                    Vmm v0 = Vmm(0);
                    Vmm vscale0 = Vmm(1);
                    Vmm vshift0 = Vmm(2);
                    Vmm vmean0 = Vmm(3);
                    Vmm vsqrtvar0 = Vmm(4);

                    size_t tail = nstl::min(c_tail_, simd_w_);
                    size_t num_iters = c_tail_ > simd_w_ ? 2 : 1;

                    for (size_t i = 0; i < num_iters; i++) {
                        if (i > 0) tail = c_tail_ - simd_w_;

                        // Gather tail bytes one at a time into xmm.
                        for (size_t tl = 0; tl < tail; tl++)
                            vpinsrb(x0, x0, src_ptr(8*i + tl), tl);

                        if (tail == simd_w_)
                            compute_vscaleshift(vscale0, vshift0, vmean0,
                                    vsqrtvar0, 32*i);
                        else
                            compute_vscaleshift(vscale0, vshift0, vmean0,
                                    vsqrtvar0, 32*i, true);

                        vpmovsxbd(v0, x0);
                        vcvtdq2ps(v0, v0);
                        uni_vfmadd213ps(v0, vscale0, vshift0);
                        if (with_relu_) uni_vmaxps(v0, v0, vzero);
                        vcvtps2dq(v0, v0);
                        vpackssdw(v0, v0, vzero);
                        vpermq(v0, v0, 0xD8);
                        vpacksswb(v0, v0, vzero);

                        // Scatter results back byte by byte.
                        for (size_t tl = 0; tl < tail; tl++)
                            vpextrb(dst_ptr(8*i + tl), x0, tl);
                    }
                });
        add(reg_soff, reg_coff_max);
        cmp(reg_soff, reg_soff_max);
        jl(mb_sp_loop);
    }
}
// JIT-emits the forward kernel loop for AVX512: reg_soff walks the outer
// (minibatch/spatial) dimension; per-channel work is delegated to
// channel_loop with a 16-channel-block body and a sub-16-channel tail.
void forward_avx512() {
    xor_(reg_soff, reg_soff);
    Label mb_sp_loop;
    L(mb_sp_loop); {
        channel_loop(
                [=](size_t unroll) {
                    // Works with 16c times @unroll blocks simultaneously.
                    // Each block up converts 16c, performs math and down
                    // converts.
                    for (size_t i = 0; i < unroll; i++) {
                        // Registers are banked: one group of @unroll regs
                        // per role.
                        Vmm v = Vmm(i + 0*unroll);
                        Vmm vscale = Vmm(i + 1*unroll);
                        Vmm vshift = Vmm(i + 2*unroll);
                        Vmm vmean = Vmm(i + 3*unroll);
                        Vmm vsqrtvar = Vmm(i + 4*unroll);

                        compute_vscaleshift(vscale, vshift, vmean, vsqrtvar,
                                i * c_in_xmm_ * sizeof(float));

                        // s8 -> f32, fused scale/shift (+ optional ReLU),
                        // then f32 -> s8 with saturation via vpmovsdb.
                        vpmovsxbd(v, src_ptr(i * c_in_xmm_));
                        vcvtdq2ps(v, v);
                        uni_vfmadd213ps(v, vscale, vshift);
                        if (with_relu_) uni_vmaxps(v, v, vzero);
                        vcvtps2dq(v, v);
                        vpmovsdb(dst_ptr(i * c_in_xmm_), v);
                    }
                },
                [=]() {
                    // There is no way to get performance as one has to
                    // work with bytes via xmm. vzeroupper kills the perf.
                    Xmm x = Xmm(0);
                    Vmm v = Vmm(0);
                    Vmm vscale = Vmm(1);
                    Vmm vshift = Vmm(2);
                    Vmm vmean = Vmm(3);
                    Vmm vsqrtvar = Vmm(4);

                    // Gather the sub-16-channel tail byte by byte.
                    for (size_t tl = 0; tl < c_tail_; tl++)
                        vpinsrb(x, x, src_ptr(tl), tl);

                    compute_vscaleshift(vscale, vshift, vmean, vsqrtvar,
                            0, true);

                    vpmovsxbd(v, x);
                    vcvtdq2ps(v, v);
                    uni_vfmadd213ps(v, vscale, vshift);
                    if (with_relu_) uni_vmaxps(v, v, vzero);
                    vcvtps2dq(v, v);
                    vpmovsdb(x, v);

                    // Scatter the results byte by byte.
                    for (size_t tl = 0; tl < c_tail_; tl++)
                        vpextrb(dst_ptr(tl), x, tl);
                });
        add(reg_soff, reg_coff_max);
        cmp(reg_soff, reg_soff_max);
        jl(mb_sp_loop);
    }
}
// Xor the current cipher state by a specific round key static void xor_round_key(unsigned char *state, unsigned char *keys, int round) { xor_(state,keys+round*16,16); }
// Translates the AMX bytecode of @amx into native code, one instruction at a
// time. An unknown opcode, a failed Process() call, or an unresolvable native
// name sets @error and stops compilation; the registered error handler (if
// any) is then invoked with the offending instruction, and Finish(error)
// produces the final output.
CompileOutput *Compiler::Compile(AMXRef amx) {
  Prepare(amx);

  Disassembler disasm(amx);
  Instruction instr;
  bool error = false;

  while (!error && disasm.Decode(instr, error)) {
    if (!Process(instr)) {
      error = true;
      break;
    }

    // Dispatch each decoded opcode to its dedicated emitter method.
    switch (instr.opcode().GetId()) {
      case OP_LOAD_PRI: load_pri(instr.operand()); break;
      case OP_LOAD_ALT: load_alt(instr.operand()); break;
      case OP_LOAD_S_PRI: load_s_pri(instr.operand()); break;
      case OP_LOAD_S_ALT: load_s_alt(instr.operand()); break;
      case OP_LREF_PRI: lref_pri(instr.operand()); break;
      case OP_LREF_ALT: lref_alt(instr.operand()); break;
      case OP_LREF_S_PRI: lref_s_pri(instr.operand()); break;
      case OP_LREF_S_ALT: lref_s_alt(instr.operand()); break;
      case OP_LOAD_I: load_i(); break;
      case OP_LODB_I: lodb_i(instr.operand()); break;
      case OP_CONST_PRI: const_pri(instr.operand()); break;
      case OP_CONST_ALT: const_alt(instr.operand()); break;
      case OP_ADDR_PRI: addr_pri(instr.operand()); break;
      case OP_ADDR_ALT: addr_alt(instr.operand()); break;
      case OP_STOR_PRI: stor_pri(instr.operand()); break;
      case OP_STOR_ALT: stor_alt(instr.operand()); break;
      case OP_STOR_S_PRI: stor_s_pri(instr.operand()); break;
      case OP_STOR_S_ALT: stor_s_alt(instr.operand()); break;
      case OP_SREF_PRI: sref_pri(instr.operand()); break;
      case OP_SREF_ALT: sref_alt(instr.operand()); break;
      case OP_SREF_S_PRI: sref_s_pri(instr.operand()); break;
      case OP_SREF_S_ALT: sref_s_alt(instr.operand()); break;
      case OP_STOR_I: stor_i(); break;
      case OP_STRB_I: strb_i(instr.operand()); break;
      case OP_LIDX: lidx(); break;
      case OP_LIDX_B: lidx_b(instr.operand()); break;
      case OP_IDXADDR: idxaddr(); break;
      case OP_IDXADDR_B: idxaddr_b(instr.operand()); break;
      case OP_ALIGN_PRI: align_pri(instr.operand()); break;
      case OP_ALIGN_ALT: align_alt(instr.operand()); break;
      // LCTRL also receives the address of the next instruction.
      case OP_LCTRL: lctrl(instr.operand(), instr.address() + instr.size()); break;
      case OP_SCTRL: sctrl(instr.operand()); break;
      case OP_MOVE_PRI: move_pri(); break;
      case OP_MOVE_ALT: move_alt(); break;
      case OP_XCHG: xchg(); break;
      case OP_PUSH_PRI: push_pri(); break;
      case OP_PUSH_ALT: push_alt(); break;
      case OP_PUSH_C: push_c(instr.operand()); break;
      case OP_PUSH: push(instr.operand()); break;
      case OP_PUSH_S: push_s(instr.operand()); break;
      case OP_POP_PRI: pop_pri(); break;
      case OP_POP_ALT: pop_alt(); break;
      case OP_STACK: // value
        stack(instr.operand());
        break;
      case OP_HEAP: heap(instr.operand()); break;
      case OP_PROC: proc(); break;
      case OP_RET: ret(); break;
      case OP_RETN: retn(); break;
      case OP_JUMP_PRI: jump_pri(); break;
      case OP_CALL:
      case OP_JUMP:
      case OP_JZER:
      case OP_JNZ:
      case OP_JEQ:
      case OP_JNEQ:
      case OP_JLESS:
      case OP_JLEQ:
      case OP_JGRTR:
      case OP_JGEQ:
      case OP_JSLESS:
      case OP_JSLEQ:
      case OP_JSGRTR:
      case OP_JSGEQ: {
        // All control-transfer opcodes share the same operand translation:
        // the operand is an absolute address, converted here into an offset
        // relative to the start of the AMX code section.
        cell dest = instr.operand() - reinterpret_cast<cell>(amx.code());
        switch (instr.opcode().GetId()) {
          case OP_CALL: call(dest); break;
          case OP_JUMP: jump(dest); break;
          case OP_JZER: jzer(dest); break;
          case OP_JNZ: jnz(dest); break;
          case OP_JEQ: jeq(dest); break;
          case OP_JNEQ: jneq(dest); break;
          case OP_JLESS: jless(dest); break;
          case OP_JLEQ: jleq(dest); break;
          case OP_JGRTR: jgrtr(dest); break;
          case OP_JGEQ: jgeq(dest); break;
          case OP_JSLESS: jsless(dest); break;
          case OP_JSLEQ: jsleq(dest); break;
          case OP_JSGRTR: jsgrtr(dest); break;
          case OP_JSGEQ: jsgeq(dest); break;
        }
        break;
      }
      case OP_SHL: shl(); break;
      case OP_SHR: shr(); break;
      case OP_SSHR: sshr(); break;
      case OP_SHL_C_PRI: shl_c_pri(instr.operand()); break;
      case OP_SHL_C_ALT: shl_c_alt(instr.operand()); break;
      case OP_SHR_C_PRI: shr_c_pri(instr.operand()); break;
      case OP_SHR_C_ALT: shr_c_alt(instr.operand()); break;
      case OP_SMUL: smul(); break;
      case OP_SDIV: sdiv(); break;
      case OP_SDIV_ALT: sdiv_alt(); break;
      case OP_UMUL: umul(); break;
      case OP_UDIV: udiv(); break;
      case OP_UDIV_ALT: udiv_alt(); break;
      case OP_ADD: add(); break;
      case OP_SUB: sub(); break;
      case OP_SUB_ALT: sub_alt(); break;
      case OP_AND: and_(); break;
      case OP_OR: or_(); break;
      case OP_XOR: xor_(); break;
      case OP_NOT: not_(); break;
      case OP_NEG: neg(); break;
      case OP_INVERT: invert(); break;
      case OP_ADD_C: add_c(instr.operand()); break;
      case OP_SMUL_C: smul_c(instr.operand()); break;
      case OP_ZERO_PRI: zero_pri(); break;
      case OP_ZERO_ALT: zero_alt(); break;
      case OP_ZERO: zero(instr.operand()); break;
      case OP_ZERO_S: zero_s(instr.operand()); break;
      case OP_SIGN_PRI: sign_pri(); break;
      case OP_SIGN_ALT: sign_alt(); break;
      case OP_EQ: eq(); break;
      case OP_NEQ: neq(); break;
      case OP_LESS: less(); break;
      case OP_LEQ: leq(); break;
      case OP_GRTR: grtr(); break;
      case OP_GEQ: geq(); break;
      case OP_SLESS: sless(); break;
      case OP_SLEQ: sleq(); break;
      case OP_SGRTR: sgrtr(); break;
      case OP_SGEQ: sgeq(); break;
      case OP_EQ_C_PRI: eq_c_pri(instr.operand()); break;
      case OP_EQ_C_ALT: eq_c_alt(instr.operand()); break;
      case OP_INC_PRI: inc_pri(); break;
      case OP_INC_ALT: inc_alt(); break;
      case OP_INC: inc(instr.operand()); break;
      case OP_INC_S: inc_s(instr.operand()); break;
      case OP_INC_I: inc_i(); break;
      case OP_DEC_PRI: dec_pri(); break;
      case OP_DEC_ALT: dec_alt(); break;
      case OP_DEC: dec(instr.operand()); break;
      case OP_DEC_S: dec_s(instr.operand()); break;
      case OP_DEC_I: dec_i(); break;
      case OP_MOVS: movs(instr.operand()); break;
      case OP_CMPS: cmps(instr.operand()); break;
      case OP_FILL: fill(instr.operand()); break;
      case OP_HALT: halt(instr.operand()); break;
      case OP_BOUNDS: bounds(instr.operand()); break;
      case OP_SYSREQ_PRI: sysreq_pri(); break;
      case OP_SYSREQ_C: {
        // Native calls are compiled by name; an unresolvable native index
        // is a compile error.
        const char *name = amx.GetNativeName(instr.operand());
        if (name == 0) {
          error = true;
        } else {
          sysreq_c(instr.operand(), name);
        }
        break;
      }
      case OP_SYSREQ_D: {
        // Like OP_SYSREQ_C, but the operand is a native address that must
        // first be mapped back to its table index.
        const char *name = amx.GetNativeName(amx.FindNative(instr.operand()));
        if (name == 0) {
          error = true;
        } else {
          sysreq_d(instr.operand(), name);
        }
        break;
      }
      case OP_SWITCH: switch_(CaseTable(amx, instr.operand())); break;
      case OP_CASETBL: casetbl(); break;
      case OP_SWAP_PRI: swap_pri(); break;
      case OP_SWAP_ALT: swap_alt(); break;
      case OP_PUSH_ADR: push_adr(instr.operand()); break;
      case OP_NOP: nop(); break;
      case OP_BREAK: break_(); break;
      default:
        // Unrecognized opcode: abort compilation.
        error = true;
    }
  }

  if (error && error_handler_ != 0) {
    error_handler_->Execute(instr);
  }

  return Finish(error);
}