void channel_loop(body_t body, tail_t tail) {
        // Emits the per-channel iteration: a loop that invokes body() on
        // groups of unroll_regs_ C16 blocks, one extra body() call for the
        // blocks that do not fill a whole group, and finally tail() when
        // c_tail_ is non-zero.
        const size_t full_groups = num_c16_blocks_ / unroll_regs_;
        const size_t leftover_blocks
                = num_c16_blocks_ - full_groups * unroll_regs_;

        // The s8 channel offset starts from reg_soff, the f32 one from zero.
        mov(reg_coff_s8, reg_soff);
        xor_(reg_coff_f32, reg_coff_f32);

        if (full_groups != 0) {
            // reg_tmp is preset to one group's worth of channels and grows
            // by the same amount per pass; the loop repeats while it is
            // still <= reg_coff_max.
            xor_(reg_tmp, reg_tmp);
            add(reg_tmp, c_in_xmm_ * unroll_regs_);

            Label c_loop;
            L(c_loop);

            body(unroll_regs_);

            add(reg_coff_s8, c_in_xmm_ * unroll_regs_);
            add(reg_coff_f32, sizeof(float) * c_in_xmm_ * unroll_regs_);
            add(reg_tmp, c_in_xmm_ * unroll_regs_);
            cmp(reg_tmp, reg_coff_max);
            jle(c_loop);
        }

        // Blocks that did not fit into a full unrolled group.
        if (leftover_blocks != 0) {
            body(leftover_blocks);
        }

        if (c_tail_) {
            // Step both offsets past the leftover blocks before the tail
            // handler runs.
            add(reg_coff_s8, c_in_xmm_ * leftover_blocks);
            add(reg_coff_f32, sizeof(float) * c_in_xmm_ * leftover_blocks);

            tail();
        }
    }
Exemple #2
0
// Build the 176-byte expanded key (11 round keys of 16 bytes) in `keys`
// from the 16-byte cipher key in `key`.
// Reference: http://en.wikipedia.org/wiki/Rijndael_key_schedule#The_key_schedule
static void expand_key(unsigned char *key, unsigned char *keys) {
  unsigned char word[4];    // 4-byte working word ('t' in the Wiki article)
  int rcon = 1;             // rcon iteration value handed to key_schedule_core
  int filled;               // number of expanded-key bytes produced so far

  // Round key 0 is the cipher key itself.
  memcpy(keys, key, 16);

  // Produce the remaining bytes one 4-byte word at a time.
  for (filled = 16; filled < 176; filled += 4) {
    // Start from the word that was generated last.
    memcpy(word, keys + filled - 4, 4);

    // Only the first word of every 16-byte round key goes through the
    // key schedule core, each time with the next rcon iteration value.
    if (filled % 16 == 0) {
      key_schedule_core(word, rcon);
      rcon++;
    }

    // Combine with the word located 16 bytes earlier and append the result.
    xor_(word, keys + filled - 16, 4);
    memcpy(keys + filled, word, 4);
  }
}
    // Emits the AVX2 forward kernel.
    //
    // reg_soff is zeroed, then a loop labelled mb_sp_loop runs channel_loop()
    // with two code generators: one for groups of full blocks (first lambda,
    // receives the number of blocks to unroll) and one for the channel tail
    // (second lambda). After each pass reg_soff is advanced by reg_coff_max
    // and the loop repeats while reg_soff < reg_soff_max (signed compare).
    void forward_avx2() {
        xor_(reg_soff, reg_soff);
        Label mb_sp_loop;
        L(mb_sp_loop); {

            channel_loop([=](size_t unroll) {
                        // Load 32 channels (two C16_blocks) in ymm, then
                        // split the work in half, each half splits in two
                        // regs with 8 channels per. When down converting,
                        // put the result in a temp register for the 1st
                        // iteration, combine the result at 2nd iteration
                        // and store ymm with 32 channels.
                        // If 16 channels, do just one half and store the
                        // result with mask.
                        Vmm v0 = Vmm(0);
                        Vmm v1 = Vmm(1);
                        Vmm vscale0 = Vmm(2);
                        Vmm vshift0 = Vmm(3);
                        Vmm vmean0 = Vmm(4);
                        Vmm vsqrtvar0 = Vmm(5);
                        Vmm vscale1 = Vmm(6);
                        Vmm vshift1 = Vmm(7);
                        Vmm vmean1 = Vmm(8);
                        Vmm vsqrtvar1 = Vmm(9);
                        Vmm tmp = Vmm(10);

                        for (size_t i = 0; i < unroll; i++) {
                            // Scale/shift for the two halves of block i; the
                            // second half sits simd_w_ floats further on.
                            compute_vscaleshift(vscale0, vshift0, vmean0,
                                    vsqrtvar0, i * c_in_xmm_ * sizeof(float));
                            compute_vscaleshift(vscale1, vshift1, vmean1,
                                    vsqrtvar1, i * c_in_xmm_ * sizeof(float)
                                    + simd_w_ * sizeof(float));

                            // Up-convert: sign-extend bytes to dwords, then
                            // convert dwords to floats.
                            vpmovsxbd(v0, src_ptr(i*c_in_xmm_));
                            vpmovsxbd(v1, src_ptr(i*c_in_xmm_ + simd_w_));
                            vcvtdq2ps(v0, v0);
                            vcvtdq2ps(v1, v1);

                            // Apply scale and shift; with_relu_ additionally
                            // takes the maximum with vzero.
                            uni_vfmadd213ps(v0, vscale0, vshift0);
                            uni_vfmadd213ps(v1, vscale1, vshift1);
                            if (with_relu_) {
                                uni_vmaxps(v0, v0, vzero);
                                uni_vmaxps(v1, v1, vzero);
                            }

                            // Down-convert back to bytes. The letters in the
                            // comments below track the lane order.
                            vcvtps2dq(v0, v0); // BA
                            vcvtps2dq(v1, v1); // DC
                            vpackssdw(v0, v0, v1); // BA + DC -> DBCA
                            vpermq(v0, v0, 0xD8); // DBCA -> DCBA
                            vperm2i128(v1, v0, v0, 0x1); // DCBA -> BADC
                            vpacksswb(v0, v0, v1); // DCBA + BADC -> badcDCBA
                            // With more than one block, park the first
                            // block's result in tmp and merge it with the
                            // second block's result on the next iteration.
                            if (i == 0 && unroll != 1)
                                uni_vmovups(tmp, v0);
                            else if (i == 1) {
                                // badcDCBA + fehgHGFE -> HGFEDCBA
                                vperm2i128(v0, v0, tmp, 0x2);
                            }
                        }

                        // A single block is stored through vbody_mask;
                        // otherwise the whole register is stored.
                        if (unroll == 1)
                            vmaskmovps(dst_ptr(), vbody_mask, v0);
                        else
                            uni_vmovups(dst_ptr(), v0);
                    },
                    [=]() {
                        // handle first 8 channels. If tail is bigger,
                        // handle second part separately. There is no way
                        // to get performance as one has to work with bytes
                        // via xmm. vzeroupper kills all the perf.
                        Xmm x0 = Xmm(0);
                        Vmm v0 = Vmm(0);
                        Vmm vscale0 = Vmm(1);
                        Vmm vshift0 = Vmm(2);
                        Vmm vmean0 = Vmm(3);
                        Vmm vsqrtvar0 = Vmm(4);

                        // At most simd_w_ channels per iteration; a second
                        // iteration covers whatever exceeds simd_w_.
                        size_t tail = nstl::min(c_tail_, simd_w_);
                        size_t num_iters = c_tail_ > simd_w_ ? 2 : 1;

                        for (size_t i = 0; i < num_iters; i++) {
                            if (i > 0)
                                tail = c_tail_ - simd_w_;

                            // Gather the tail bytes one at a time into x0.
                            for (size_t tl = 0; tl < tail; tl++)
                                vpinsrb(x0, x0, src_ptr(8*i + tl), tl);

                            // A partial group passes an extra `true` to
                            // compute_vscaleshift.
                            // NOTE(review): presumably this selects a masked
                            // load of the statistics - confirm in
                            // compute_vscaleshift.
                            if (tail == simd_w_)
                                compute_vscaleshift(vscale0, vshift0, vmean0,
                                        vsqrtvar0, 32*i);
                            else
                                compute_vscaleshift(vscale0, vshift0, vmean0,
                                        vsqrtvar0, 32*i, true);

                            // Same up-convert / scale-shift / optional max /
                            // down-convert sequence as the body, packing
                            // against vzero.
                            vpmovsxbd(v0, x0);
                            vcvtdq2ps(v0, v0);
                            uni_vfmadd213ps(v0, vscale0, vshift0);
                            if (with_relu_)
                                uni_vmaxps(v0, v0, vzero);
                            vcvtps2dq(v0, v0);
                            vpackssdw(v0, v0, vzero);
                            vpermq(v0, v0, 0xD8);
                            vpacksswb(v0, v0, vzero);

                            // Write the result bytes out one at a time.
                            for (size_t tl = 0; tl < tail; tl++)
                                vpextrb(dst_ptr(8*i + tl), x0, tl);
                        }
                    });

            // Advance to the next step and loop while below reg_soff_max.
            add(reg_soff, reg_coff_max);
            cmp(reg_soff, reg_soff_max);
            jl(mb_sp_loop);
        }
    }
    // Emits the AVX-512 forward kernel.
    //
    // Same outer structure as the AVX2 variant: reg_soff is zeroed, then a
    // loop labelled mb_sp_loop runs channel_loop() with a full-block body
    // generator and a channel-tail generator, advances reg_soff by
    // reg_coff_max and repeats while reg_soff < reg_soff_max (signed compare).
    void forward_avx512() {
        xor_(reg_soff, reg_soff);
        Label mb_sp_loop;
        L(mb_sp_loop); {

            channel_loop([=](size_t unroll) {
                        // Works with 16c times @unroll blocks simultaneously.
                        // Each block up converts 16c, performs math and down
                        // converts.
                        for (size_t i = 0; i < unroll; i++) {
                            // Five register groups of `unroll` registers
                            // each; block i uses the i-th register of every
                            // group.
                            Vmm v = Vmm(i + 0*unroll);
                            Vmm vscale = Vmm(i + 1*unroll);
                            Vmm vshift = Vmm(i + 2*unroll);
                            Vmm vmean = Vmm(i + 3*unroll);
                            Vmm vsqrtvar = Vmm(i + 4*unroll);

                            compute_vscaleshift(vscale, vshift, vmean, vsqrtvar,
                                i * c_in_xmm_ * sizeof(float));

                            // Up-convert: sign-extend bytes to dwords, then
                            // convert dwords to floats.
                            vpmovsxbd(v, src_ptr(i * c_in_xmm_));
                            vcvtdq2ps(v, v);

                            // Apply scale and shift; with_relu_ additionally
                            // takes the maximum with vzero.
                            uni_vfmadd213ps(v, vscale, vshift);
                            if (with_relu_)
                                uni_vmaxps(v, v, vzero);

                            // Down-convert to dwords and store them as
                            // signed-saturated bytes straight to memory.
                            vcvtps2dq(v, v);
                            vpmovsdb(dst_ptr(i * c_in_xmm_), v);
                        }
                    },
                    [=]() {
                        // There is no way to get performance as one has to
                        // work with bytes via xmm. vzeroupper kills the perf.
                        Xmm x = Xmm(0);
                        Vmm v = Vmm(0);
                        Vmm vscale = Vmm(1);
                        Vmm vshift = Vmm(2);
                        Vmm vmean = Vmm(3);
                        Vmm vsqrtvar = Vmm(4);

                        // Gather the c_tail_ source bytes one at a time.
                        for (size_t tl = 0; tl < c_tail_; tl++)
                            vpinsrb(x, x, src_ptr(tl), tl);

                        // NOTE(review): the trailing `true` presumably
                        // requests a masked load of the statistics - confirm
                        // in compute_vscaleshift.
                        compute_vscaleshift(vscale, vshift, vmean, vsqrtvar, 0,
                                true);

                        vpmovsxbd(v, x);
                        vcvtdq2ps(v, v);

                        uni_vfmadd213ps(v, vscale, vshift);
                        if (with_relu_)
                            uni_vmaxps(v, v, vzero);

                        vcvtps2dq(v, v);
                        vpmovsdb(x, v);

                        // Write the c_tail_ result bytes out one at a time.
                        for (size_t tl = 0; tl < c_tail_; tl++)
                            vpextrb(dst_ptr(tl), x, tl);
                    });

            // Advance to the next step and loop while below reg_soff_max.
            add(reg_soff, reg_coff_max);
            cmp(reg_soff, reg_soff_max);
            jl(mb_sp_loop);
        }
    }
Exemple #5
0
// Apply round key number `round` to the cipher state: xor_ is handed the
// state and the 16-byte slice of `keys` that starts at offset round*16.
static void xor_round_key(unsigned char *state, unsigned char *keys, int round) {
  unsigned char *round_key = &keys[round * 16];

  xor_(state, round_key, 16);
}
// Compiles the code of `amx` by decoding it instruction by instruction and
// dispatching every opcode to the member function of the same (lower-case)
// name.
//
// Flow: Prepare(amx) runs first; each decoded instruction is passed to
// Process() and then to its opcode handler; decoding stops at the first
// failure. A failure is recorded when Process() returns false, when the
// disassembler reports an error, when a SYSREQ native name cannot be
// resolved, or when the opcode has no case in the switch. On failure the
// error handler (if one is set) is executed with the last instruction.
// The return value is whatever Finish(error) yields.
CompileOutput *Compiler::Compile(AMXRef amx) {
  Prepare(amx);

  Disassembler disasm(amx);
  Instruction instr;
  bool error = false;

  // Decode() receives `error` as well, so the loop ends either when there is
  // nothing left to decode or when any step flagged an error.
  while (!error && disasm.Decode(instr, error)) {
    if (!Process(instr)) {
      error = true;
      break;
    }

    switch (instr.opcode().GetId()) {
      case OP_LOAD_PRI:
        load_pri(instr.operand());
        break;
      case OP_LOAD_ALT:
        load_alt(instr.operand());
        break;
      case OP_LOAD_S_PRI:
        load_s_pri(instr.operand());
        break;
      case OP_LOAD_S_ALT:
        load_s_alt(instr.operand());
        break;
      case OP_LREF_PRI:
        lref_pri(instr.operand());
        break;
      case OP_LREF_ALT:
        lref_alt(instr.operand());
        break;
      case OP_LREF_S_PRI:
        lref_s_pri(instr.operand());
        break;
      case OP_LREF_S_ALT:
        lref_s_alt(instr.operand());
        break;
      case OP_LOAD_I:
        load_i();
        break;
      case OP_LODB_I:
        lodb_i(instr.operand());
        break;
      case OP_CONST_PRI:
        const_pri(instr.operand());
        break;
      case OP_CONST_ALT:
        const_alt(instr.operand());
        break;
      case OP_ADDR_PRI:
        addr_pri(instr.operand());
        break;
      case OP_ADDR_ALT:
        addr_alt(instr.operand());
        break;
      case OP_STOR_PRI:
        stor_pri(instr.operand());
        break;
      case OP_STOR_ALT:
        stor_alt(instr.operand());
        break;
      case OP_STOR_S_PRI:
        stor_s_pri(instr.operand());
        break;
      case OP_STOR_S_ALT:
        stor_s_alt(instr.operand());
        break;
      case OP_SREF_PRI:
        sref_pri(instr.operand());
        break;
      case OP_SREF_ALT:
        sref_alt(instr.operand());
        break;
      case OP_SREF_S_PRI:
        sref_s_pri(instr.operand());
        break;
      case OP_SREF_S_ALT:
        sref_s_alt(instr.operand());
        break;
      case OP_STOR_I:
        stor_i();
        break;
      case OP_STRB_I:
        strb_i(instr.operand());
        break;
      case OP_LIDX:
        lidx();
        break;
      case OP_LIDX_B:
        lidx_b(instr.operand());
        break;
      case OP_IDXADDR:
        idxaddr();
        break;
      case OP_IDXADDR_B:
        idxaddr_b(instr.operand());
        break;
      case OP_ALIGN_PRI:
        align_pri(instr.operand());
        break;
      case OP_ALIGN_ALT:
        align_alt(instr.operand());
        break;
      case OP_LCTRL:
        // Second argument is the address just past this instruction.
        lctrl(instr.operand(), instr.address() + instr.size());
        break;
      case OP_SCTRL:
        sctrl(instr.operand());
        break;
      case OP_MOVE_PRI:
        move_pri();
        break;
      case OP_MOVE_ALT:
        move_alt();
        break;
      case OP_XCHG:
        xchg();
        break;
      case OP_PUSH_PRI:
        push_pri();
        break;
      case OP_PUSH_ALT:
        push_alt();
        break;
      case OP_PUSH_C:
        push_c(instr.operand());
        break;
      case OP_PUSH:
        push(instr.operand());
        break;
      case OP_PUSH_S:
        push_s(instr.operand());
        break;
      case OP_POP_PRI:
        pop_pri();
        break;
      case OP_POP_ALT:
        pop_alt();
        break;
      case OP_STACK: // value
        stack(instr.operand());
        break;
      case OP_HEAP:
        heap(instr.operand());
        break;
      case OP_PROC:
        proc();
        break;
      case OP_RET:
        ret();
        break;
      case OP_RETN:
        retn();
        break;
      case OP_JUMP_PRI:
        jump_pri();
        break;
      // Call and jump opcodes share one preamble: the operand is turned into
      // an offset relative to the start of the code section before the
      // opcode-specific handler is chosen by the inner switch.
      case OP_CALL:
      case OP_JUMP:
      case OP_JZER:
      case OP_JNZ:
      case OP_JEQ:
      case OP_JNEQ:
      case OP_JLESS:
      case OP_JLEQ:
      case OP_JGRTR:
      case OP_JGEQ:
      case OP_JSLESS:
      case OP_JSLEQ:
      case OP_JSGRTR:
      case OP_JSGEQ: {
        cell dest = instr.operand() - reinterpret_cast<cell>(amx.code());
        switch (instr.opcode().GetId()) {
          case OP_CALL:
            call(dest);
            break;
          case OP_JUMP:
            jump(dest);
            break;
          case OP_JZER:
            jzer(dest);
            break;
          case OP_JNZ:
            jnz(dest);
            break;
          case OP_JEQ:
            jeq(dest);
            break;
          case OP_JNEQ:
            jneq(dest);
            break;
          case OP_JLESS:
            jless(dest);
            break;
          case OP_JLEQ:
            jleq(dest);
            break;
          case OP_JGRTR:
            jgrtr(dest);
            break;
          case OP_JGEQ:
            jgeq(dest);
            break;
          case OP_JSLESS:
            jsless(dest);
            break;
          case OP_JSLEQ:
            jsleq(dest);
            break;
          case OP_JSGRTR:
            jsgrtr(dest);
            break;
          case OP_JSGEQ:
            jsgeq(dest);
            break;
        }
        break;
      }
      case OP_SHL:
        shl();
        break;
      case OP_SHR:
        shr();
        break;
      case OP_SSHR:
        sshr();
        break;
      case OP_SHL_C_PRI:
        shl_c_pri(instr.operand());
        break;
      case OP_SHL_C_ALT:
        shl_c_alt(instr.operand());
        break;
      case OP_SHR_C_PRI:
        shr_c_pri(instr.operand());
        break;
      case OP_SHR_C_ALT:
        shr_c_alt(instr.operand());
        break;
      case OP_SMUL:
        smul();
        break;
      case OP_SDIV:
        sdiv();
        break;
      case OP_SDIV_ALT:
        sdiv_alt();
        break;
      case OP_UMUL:
        umul();
        break;
      case OP_UDIV:
        udiv();
        break;
      case OP_UDIV_ALT:
        udiv_alt();
        break;
      case OP_ADD:
        add();
        break;
      case OP_SUB:
        sub();
        break;
      case OP_SUB_ALT:
        sub_alt();
        break;
      case OP_AND:
        and_();
        break;
      case OP_OR:
        or_();
        break;
      case OP_XOR:
        xor_();
        break;
      case OP_NOT:
        not_();
        break;
      case OP_NEG:
        neg();
        break;
      case OP_INVERT:
        invert();
        break;
      case OP_ADD_C:
        add_c(instr.operand());
        break;
      case OP_SMUL_C:
        smul_c(instr.operand());
        break;
      case OP_ZERO_PRI:
        zero_pri();
        break;
      case OP_ZERO_ALT:
        zero_alt();
        break;
      case OP_ZERO:
        zero(instr.operand());
        break;
      case OP_ZERO_S:
        zero_s(instr.operand());
        break;
      case OP_SIGN_PRI:
        sign_pri();
        break;
      case OP_SIGN_ALT:
        sign_alt();
        break;
      case OP_EQ:
        eq();
        break;
      case OP_NEQ:
        neq();
        break;
      case OP_LESS:
        less();
        break;
      case OP_LEQ:
        leq();
        break;
      case OP_GRTR:
        grtr();
        break;
      case OP_GEQ:
        geq();
        break;
      case OP_SLESS:
        sless();
        break;
      case OP_SLEQ:
        sleq();
        break;
      case OP_SGRTR:
        sgrtr();
        break;
      case OP_SGEQ:
        sgeq();
        break;
      case OP_EQ_C_PRI:
        eq_c_pri(instr.operand());
        break;
      case OP_EQ_C_ALT:
        eq_c_alt(instr.operand());
        break;
      case OP_INC_PRI:
        inc_pri();
        break;
      case OP_INC_ALT:
        inc_alt();
        break;
      case OP_INC:
        inc(instr.operand());
        break;
      case OP_INC_S:
        inc_s(instr.operand());
        break;
      case OP_INC_I:
        inc_i();
        break;
      case OP_DEC_PRI:
        dec_pri();
        break;
      case OP_DEC_ALT:
        dec_alt();
        break;
      case OP_DEC:
        dec(instr.operand());
        break;
      case OP_DEC_S:
        dec_s(instr.operand());
        break;
      case OP_DEC_I:
        dec_i();
        break;
      case OP_MOVS:
        movs(instr.operand());
        break;
      case OP_CMPS:
        cmps(instr.operand());
        break;
      case OP_FILL:
        fill(instr.operand());
        break;
      case OP_HALT:
        halt(instr.operand());
        break;
      case OP_BOUNDS:
        bounds(instr.operand());
        break;
      case OP_SYSREQ_PRI:
        sysreq_pri();
        break;
      case OP_SYSREQ_C: {
        // The operand is passed to GetNativeName() directly; a null name
        // aborts compilation.
        const char *name = amx.GetNativeName(instr.operand());
        if (name == 0) {
          error = true;
        } else {
          sysreq_c(instr.operand(), name);
        }
        break;
      }
      case OP_SYSREQ_D: {
        // Here the operand is first mapped through FindNative() and the
        // result is used for the name lookup; a null name aborts compilation.
        const char *name = amx.GetNativeName(amx.FindNative(instr.operand()));
        if (name == 0) {
          error = true;
        } else {
          sysreq_d(instr.operand(), name);
        }
        break;
      }
      case OP_SWITCH:
        switch_(CaseTable(amx, instr.operand()));
        break;
      case OP_CASETBL:
        casetbl();
        break;
      case OP_SWAP_PRI:
        swap_pri();
        break;
      case OP_SWAP_ALT:
        swap_alt();
        break;
      case OP_PUSH_ADR:
        push_adr(instr.operand());
        break;
      case OP_NOP:
        nop();
        break;
      case OP_BREAK:
        break_();
        break;
    // Any opcode without a case above is treated as a compilation error.
    default:
      error = true;
    }
  }

  // `instr` still holds the last instruction decoded when the error was
  // flagged; it is what the handler receives.
  if (error && error_handler_ != 0) {
    error_handler_->Execute(instr);
  }

  return Finish(error);
}