/*!
   Calculate a 3x3 dilate filter.

   Every destination element receives the maximum over the 3x3 source
   neighbourhood centred on the same position (full rectangular structuring
   element). The maximum of each 3-element column is computed once and
   reused for three consecutive outputs, so each inner iteration loads only
   one new column (one element from each of the three rows).

   The source is read from row -1 to row bh and from column -1 to column
   bw+1 relative to \p src (column bw+1 is a look-ahead load whose value is
   never used), so the caller must provide readable padding around the block.

   \param dst  - [Output] Destination block pointer
   \param dstr - [Input]  Destination block stride
   \param src  - [Input]  Source block pointer
   \param sstr - [Input]  Source block stride
   \param bw   - [Input]  Block width
   \param bh   - [Input]  Block height
 */
void
apu_flt_dilate_3x3(
            vec08u* dst, int dstr,
      const vec08u* src, int sstr,
            int bw, int bh
)
{
   // Structuring element: Rectangular - hardcoded
      //  1,  1,  1,
      //  1,  1,  1,
      //  1,  1,  1,

   
   // Loop
   // vec16s a_max, a0, a1, a2, a3, a4, a5, a6, a7, a8;
   // vec16s b_max, b6, b7, b8;
   
   // NOTE(review): chess_loop_range / chess_storage are target-compiler
   // annotations not defined here; presumably a minimum trip-count hint and
   // a vector-register placement request - confirm against the toolchain docs.
   for (int y = 0; y < bh; ++y) chess_loop_range(1,) 
   {
      // Neighbors: row above, current row and row below, each starting one
      // column to the left of the block.
      const vec08u* ps0 = (src-1) + (y-1)*sstr;
      const vec08u* ps1 = (src-1) + (y  )*sstr;
      const vec08u* ps2 = (src-1) + (y+1)*sstr;
      
      // Load column -1 of the three rows.
      vec16s chess_storage(V0) s0 = *ps0++;
      vec16s chess_storage(V1) s1 = *ps1++;
      vec16s chess_storage(V2) s2 = *ps2++;

      // Prologue: amax = column maximum of column -1, bmax = column maximum
      // of column 0. Each statement pair consumes the value just loaded and
      // immediately fetches the next column of the same row.
      // NOTE(review): vswap(a, b, cond) is defined elsewhere; presumably it
      // exchanges a and b where cond holds, leaving the larger value in the
      // second argument here - confirm.
      vec16s chess_storage(V4) amax = s0;    s0 = *ps0++;
               vswap(s1, amax, s1 > amax);   s1 = *ps1++;
               vswap(s2, amax, s2 > amax);   s2 = *ps2++;
      vec16s chess_storage(V5) bmax = s0;    s0 = *ps0++;
               vswap(s1, bmax, s1 > bmax);   s1 = *ps1++;
               vswap(s2, bmax, s2 > bmax);   s2 = *ps2++;
      
      for (int x = 0; x < bw; ++x) chess_loop_range(1,) 
      {
         // cmax = column maximum of column x+1; the loads on the right
         // already fetch column x+2 for the next iteration.
         vec16s chess_storage(V6) cmax = s0;    s0 = *ps0++;
                  vswap(s1, cmax, s1 > cmax);   s1 = *ps1++;
                  vswap(s2, cmax, s2 > cmax);   s2 = *ps2++;
               
         // Compare and find max of the three column maxima (columns x-1, x,
         // x+1). Copies are used so amax/bmax/cmax themselves stay intact.
         vec16s o = amax;
         vec16s b = bmax; vswap(o, b, o < b);
         vec16s c = cmax; vswap(o, c, o < c);
         
         // Slide the window: the two right-hand column maxima are reused by
         // the next output.
         amax = bmax; bmax = cmax;
         // Assign to output
         dst[x] = (vec08u)o;
      }
      
      // Proceed to next line (the source pointers are recomputed from y at
      // the top of the loop, so only dst advances here)
      dst += dstr;
   }
}
// Three-way quicksort of posn[lo..hi], keyed on the byte data[posn[i]+depth],
// which also fills in rank[] for every position it finishes.
//
// Recursion is replaced by an explicit stack of (lo, hi, depth) triples.
// Each popped segment is handled in one of three ways:
//   - depth >= PRESORT_DEPTH: no further sorting; every element of the
//     segment receives rank hi.
//   - fewer than PRESORT_THRESH+1 elements: insertion sort using GTD, then
//     ranks are assigned so that elements GTD does not separate share the
//     highest index of their run.
//   - otherwise: partition around the pivot byte into <, =, > parts; the
//     "=" part is pushed again with depth+1, the other two with the same depth.
// NOTE(review): GTD, pivot3d, mini and the four-argument vswap are defined
// elsewhere; the comments below describe how they are used here, not their
// exact contracts.
void 
_BSort::quicksort3d(int lo, int hi, int depth)
{
  /* Initialize stack */
  int slo[QUICKSORT_STACK];
  int shi[QUICKSORT_STACK];
  int sd[QUICKSORT_STACK];
  int sp = 1;
  slo[0] = lo;
  shi[0] = hi;
  sd[0] = depth;
  // Recursion elimination loop
  while (--sp>=0)
    {
      // Pop the next pending segment
      lo = slo[sp];
      hi = shi[sp];
      depth = sd[sp];
      // Test for insertion sort
      if (depth >= PRESORT_DEPTH)
        {
          // Depth limit reached: leave the segment unsorted and give all of
          // its elements the same rank (the segment's last index)
          for (int i=lo; i<=hi; i++)
            rank[posn[i]] = hi;
        }
      else if (hi-lo<PRESORT_THRESH)
        {
          int i,j;
          // Plain insertion sort of posn[lo..hi] using GTD as "greater than"
          for (i=lo+1; i<=hi; i++)
            {
              int tmp = posn[i];
              for(j=i-1; j>=lo && GTD(posn[j], tmp, depth); j--)
                posn[j+1] = posn[j];
              posn[j+1] = tmp;
            }
          // Walk the sorted segment from the top. posn[i] starts a run; all
          // preceding elements that posn[i] is not GTD-greater than get the
          // same rank i. The outer loop then resumes at j, the first element
          // that was not absorbed into the run.
          for(i=hi;i>=lo;i=j)
            {
              int tmp = posn[i];
              rank[tmp] = i;
              for (j=i-1; j>=lo && !GTD(tmp,posn[j],depth); j--)
                rank[posn[j]] = i;
            }
        }
        else
        {
          int tmp;
          // dd[p] is the byte at offset depth from position p
          unsigned char *dd=data+depth;
          unsigned char med = pivot3d(dd,lo,hi);
          // -- positions are organized as follows:
          //   [lo..l1[ [l1..l[ ]h..h1] ]h1..hi]
          //      =        <       >        =
          int l1 = lo;
          int h1 = hi;
          // Skip pivot-equal elements already sitting at either end
          while (dd[posn[l1]]==med && l1<h1) { l1++; }
          while (dd[posn[h1]]==med && l1<h1) { h1--; }
          int l = l1;
          int h = h1;
          // -- partition set
          for (;;)
            {
              // Advance l over elements <= pivot; equal ones are swapped to
              // the left "=" area that grows at l1
              while (l<=h)
                {
                  int c = (int)dd[posn[l]] - (int)med;
                  if (c > 0) break;
                  if (c == 0) { tmp=posn[l]; posn[l]=posn[l1]; posn[l1++]=tmp; }
                  l++;
                }
              // Retreat h over elements >= pivot; equal ones are swapped to
              // the right "=" area that grows downwards from h1
              while (l<=h)
                {
                  int c = (int)dd[posn[h]] - (int)med;
                  if (c < 0) break;
                  if (c == 0) { tmp=posn[h]; posn[h]=posn[h1]; posn[h1--]=tmp; }
                  h--;
                }
              if (l>h) break;
              // posn[l] is > pivot and posn[h] is < pivot: exchange them
              tmp=posn[l]; posn[l]=posn[h]; posn[h]=tmp;
            }
          // -- reorganize as follows
          //   [lo..l1[ [l1..h1] ]h1..hi]
          //      <        =        > 
          // Move the two outer "=" areas into the middle by exchanging the
          // shorter of each (=, </>) pair of neighbouring areas
          tmp = mini(l1-lo, l-l1);
          vswap(lo, l-tmp, tmp, posn);
          l1 = lo + (l-l1);
          tmp = mini(hi-h1, h1-h);
          vswap(hi-tmp+1, h+1, tmp, posn);
          h1 = hi - (h1-h);
          // -- process segments
          // At most three segments are pushed below
          ASSERT(sp+3<QUICKSORT_STACK);
          // ----- middle segment (=?) [l1, h1]
          l = l1; h = h1;
          if (med==0) // special case for marker [slow]
            // The element whose byte at this depth is the last byte of the
            // data (index size-1) is moved to the front of the middle
            // segment, ranked there, and excluded from further sorting
            for (int i=l; i<=h; i++)
              if ((int)posn[i]+depth == size-1)
                { 
                  tmp=posn[i]; posn[i]=posn[l]; posn[l]=tmp; 
                  rank[tmp]=l++; break; 
                }
          // Segments of two or more elements are pushed for later
          // processing; a single element is ranked immediately
          if (l<h)
            { slo[sp] = l; shi[sp] = h; sd[sp++] = depth+1; }
          else if (l==h)
            { rank[posn[h]] = h; }
          // ----- lower segment (<) [lo, l1[
          l = lo;
          h = l1-1;
          if (l<h)
            { slo[sp] = l; shi[sp] = h; sd[sp++] = depth; }
          else if (l==h)
            { rank[posn[h]] = h; }
          // ----- upper segment (>) ]h1, hi]
          l = h1+1;
          h = hi;
          if (l<h)
            { slo[sp] = l; shi[sp] = h; sd[sp++] = depth; }
          else if (l==h)
            { rank[posn[h]] = h; }
        }
    }
}
void 
_BSort::quicksort3r(int lo, int hi, int depth)
{
  /* Initialize stack */
  int slo[QUICKSORT_STACK];
  int shi[QUICKSORT_STACK];
  int sp = 1;
  slo[0] = lo;
  shi[0] = hi;
  // Recursion elimination loop
  while (--sp>=0)
    {
      lo = slo[sp];
      hi = shi[sp];
      // Test for insertion sort
      if (hi-lo<RANKSORT_THRESH)
        {
          ranksort(lo, hi, depth);
        }
      else
        {
          int tmp;
          int *rr=rank+depth;
          int med = pivot3r(rr,lo,hi);
          // -- positions are organized as follows:
          //   [lo..l1[ [l1..l[ ]h..h1] ]h1..hi]
          //      =        <       >        =
          int l1 = lo;
          int h1 = hi;
          while (rr[posn[l1]]==med && l1<h1) { l1++; }
          while (rr[posn[h1]]==med && l1<h1) { h1--; }
          int l = l1;
          int h = h1;
          // -- partition set
          for (;;)
            {
              while (l<=h)
                {
                  int c = rr[posn[l]] - med;
                  if (c > 0) break;
                  if (c == 0) { tmp=posn[l]; posn[l]=posn[l1]; posn[l1++]=tmp; }
                  l++;
                }
              while (l<=h)
                {
                  int c = rr[posn[h]] - med;
                  if (c < 0) break;
                  if (c == 0) { tmp=posn[h]; posn[h]=posn[h1]; posn[h1--]=tmp; }
                  h--;
                }
              if (l>h) break;
              tmp=posn[l]; posn[l]=posn[h]; posn[h]=tmp;
            }
          // -- reorganize as follows
          //   [lo..l1[ [l1..h1] ]h1..hi]
          //      <        =        > 
          tmp = mini(l1-lo, l-l1);
          vswap(lo, l-tmp, tmp, posn);
          l1 = lo + (l-l1);
          tmp = mini(hi-h1, h1-h);
          vswap(hi-tmp+1, h+1, tmp, posn);
          h1 = hi - (h1-h);
          // -- process segments
          ASSERT(sp+2<QUICKSORT_STACK);
          // ----- middle segment (=?) [l1, h1]
          for(int i=l1;i<=h1;i++) 
            rank[posn[i]] = h1;
          // ----- lower segment (<) [lo, l1[
          if (l1 > lo)
            {
              for(int i=lo;i<l1;i++) 
                rank[posn[i]]=l1-1;
              slo[sp]=lo;
              shi[sp]=l1-1;
              if (slo[sp] < shi[sp])  
                sp++;
            }
          // ----- upper segment (>) ]h1, hi]
          if (h1 < hi)
            {
              slo[sp]=h1+1;
              shi[sp]=hi;
              if (slo[sp] < shi[sp])  
                sp++;
            }
        }
    }
}
Example #4
0
// Generate an integer binary operation.
//
// The two operands are the top two entries of the value stack (vtop[-1] is
// the left operand, vtop[0] the right one). One entry is popped and the
// remaining top entry describes the result.
//
// op is either a character operator ('+', '-', '&', '^', '|', '*', '/', '%')
// or a TOK_xxx token. Any op not listed explicitly falls into the default
// case, which emits the ALU operation with opc 7; for ops in the range
// TOK_ULT..TOK_GT the result is then marked as a pending comparison (VT_CMP).
// NOTE(review): the opc values are used as the /digit field of the x86
// group-1 ALU opcodes (0=add, 1=or, 2=adc, 3=sbb, 4=and, 5=sub, 6=xor,
// 7=cmp) - confirm against the instruction set reference.
void gen_opi(int op) {
  int r, fr, opc, c;

  switch (op) {
    case '+':
    case TOK_ADDC1: // Add with carry generation
      opc = 0;
    // Shared emitter for all two-operand ALU operations; reached by goto
    // from the cases below with opc already selected.
    gen_op8:
      if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
        // Constant case: load only the left operand into a register (the
        // swaps bring it to the top for gv() and back again)
        vswap();
        r = gv(RC_INT);
        vswap();
        c = vtop->c.i;
        if (c == (char) c) {
          // Constant fits in a signed byte.
          // Optimize +/- 1 case with inc and dec. Only the plain '+' and '-'
          // operators take this path; the carry-related tokens do not.
          if (op == '+' && c == 1 || op == '-' && c == -1) {
            o(0x40 | r);  // inc r
          } else if (op == '-' && c == 1 || op == '+' && c == -1) {
            o(0x48 | r);  // dec r
          } else {
            // Short form: opcode 0x83, modrm selecting opc and r, 8-bit immediate
            o(0x83);
            o(0xc0 | (opc << 3) | r);
            g(c);
          }
        } else {
          // Long form: opcode 0x81 followed by modrm and a 32-bit immediate
          o(0x81);
          oad(0xc0 | (opc << 3) | r, c);
        }
      } else {
        // Register/register form: both operands are loaded into integer
        // registers, result goes to r (left operand)
        gv2(RC_INT, RC_INT);
        r = vtop[-1].r;
        fr = vtop[0].r;
        o((opc << 3) | 0x01);
        o(0xc0 + r + fr * 8); 
      }
      vtop--;
      if (op >= TOK_ULT && op <= TOK_GT) {
        // Comparison: the result lives in the flags; remember which test to use
        vtop->r = VT_CMP;
        vtop->c.i = op;
      }
      break;
    case '-':
    case TOK_SUBC1: // Subtract with carry generation
      opc = 5;
      goto gen_op8;
    case TOK_ADDC2: // Add with carry use
      opc = 2;
      goto gen_op8;
    case TOK_SUBC2: // Subtract with carry use
      opc = 3;
      goto gen_op8;
    case '&':
      opc = 4;
      goto gen_op8;
    case '^':
      opc = 6;
      goto gen_op8;
    case '|':
      opc = 1;
      goto gen_op8;
    case '*':
      gv2(RC_INT, RC_INT);
      r = vtop[-1].r;
      fr = vtop[0].r;
      vtop--;
      o(0xaf0f); // imul fr, r
      o(0xc0 + fr + r * 8);
      break;
    case TOK_SHL:
      opc = 4;
      goto gen_shift;
    case TOK_SHR:
      opc = 5;
      goto gen_shift;
    case TOK_SAR:
      opc = 7;
    // Shared emitter for the three shift operations
    gen_shift:
      // Pre-build the register-direct modrm byte; only the register number
      // is OR-ed in below
      opc = 0xc0 | (opc << 3);
      if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
        // Constant case: shift count is masked to 5 bits and emitted as an
        // immediate byte
        vswap();
        r = gv(RC_INT);
        vswap();
        c = vtop->c.i & 0x1f;
        o(0xc1); // shl/shr/sar $xxx, r
        o(opc | r);
        g(c);
      } else {
        // Generate the shift in ecx
        gv2(RC_INT, RC_ECX);
        r = vtop[-1].r;
        o(0xd3); // shl/shr/sar %cl, r
        o(opc | r);
      }
      vtop--;
      break;
    case '/':
    case TOK_UDIV:
    case TOK_PDIV:
    case '%':
    case TOK_UMOD:
    case TOK_UMULL:
      // First operand must be in eax
      // TODO: need better constraint for second operand
      gv2(RC_EAX, RC_ECX);
      r = vtop[-1].r;
      fr = vtop[0].r;
      vtop--;
      // edx is written by the instructions below, so spill whatever it holds
      save_reg(TREG_EDX);
      if (op == TOK_UMULL) {
        o(0xf7); // mul fr
        o(0xe0 + fr);
        // Result is a register pair: eax in r, edx in r2
        vtop->r2 = TREG_EDX;
        r = TREG_EAX;
      } else {
        if (op == TOK_UDIV || op == TOK_UMOD) {
          o(0xf7d231); // xor %edx, %edx, div fr, %eax
          o(0xf0 + fr);
        } else {
          o(0xf799); // cltd, idiv fr, %eax
          o(0xf8 + fr);
        }
        // Remainder operators take their result from edx, the others from eax
        if (op == '%' || op == TOK_UMOD) {
          r = TREG_EDX;
        } else {
          r = TREG_EAX;
        }
      }
      vtop->r = r;
      break;
    default:
      // Everything else (the comparison tokens) is emitted through gen_op8
      // with opc 7
      opc = 7;
      goto gen_op8;
  }
}
Example #5
0
// Generate function call. The function address is pushed first, then
// all the parameters in call order. This function pops all the
// parameters and the function address.
void gfunc_call(int nb_args) {
  int size, align, r, args_size, i, func_call, v;
  Sym *func_sym;
  
  args_size = 0;
  for (i = 0; i < nb_args; i++) {
    if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
      size = type_size(&vtop->type, &align);
      // Align to stack align size
      size = (size + 3) & ~3;
      // Allocate the necessary size on stack
      oad(0xec81, size); // sub $xxx, %esp
      // Generate structure store
      r = get_reg(RC_INT);
      o(0x89); // mov %esp, r
      o(0xe0 + r);
      vset(&vtop->type, r | VT_LVAL, 0);
      vswap();
      vstore();
      args_size += size;
    } else if (is_float(vtop->type.t)) {
      gv(RC_FLOAT); // Only one float register
      if ((vtop->type.t & VT_BTYPE) == VT_FLOAT) {
        size = 4;
      } else if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE) {
        size = 8;
      } else {
        size = 12;
      }
      oad(0xec81, size); // sub $xxx, %esp
      if (size == 12) {
        o(0x7cdb);
      } else {
        o(0x5cd9 + size - 4); // fstp[s|l] 0(%esp)
      }
      g(0x24);
      g(0x00);
      args_size += size;
    } else {
      // Simple type (currently always same size)
      // TODO: implicit cast?
      v = vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM);
      if (v == VT_CONST || v == (VT_CONST | VT_SYM)) {
        // Push constant
        if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
          size = 8;
          if (vtop->c.word[1] == (char) vtop->c.word[1]) {
            g(0x6a); // push imm8
            g(vtop->c.word[1]);
          } else {
            g(0x68); // push imm32
            gen_le32(vtop->c.word[1]);
          }
        } else {
          size = 4;
        }
        if ((v & VT_SYM) == 0 && vtop->c.i == (char) vtop->c.i) {
          g(0x6a); // push imm8
          g(vtop->c.i);
        } else {
          g(0x68); // push imm32
          gen_addr32(v, vtop->sym, vtop->c.i);
        }
      } else {
        r = gv(RC_INT);
        if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
          size = 8;
          o(0x50 + vtop->r2); // push r2
        } else {
          size = 4;
        }
        o(0x50 + r); // push r
      }
      args_size += size;
    }
    vtop--;
  }
  save_regs(0); // Save used temporary registers
  func_sym = vtop->type.ref;
  func_call = FUNC_CALL(func_sym->r);

  // fast call case
  if ((func_call >= FUNC_FASTCALL1 && func_call <= FUNC_FASTCALL3) ||
    func_call == FUNC_FASTCALLW) {
    int fastcall_nb_regs;
    uint8_t *fastcall_regs_ptr;
    if (func_call == FUNC_FASTCALLW) {
      fastcall_regs_ptr = fastcallw_regs;
      fastcall_nb_regs = 2;
    } else {
      fastcall_regs_ptr = fastcall_regs;
      fastcall_nb_regs = func_call - FUNC_FASTCALL1 + 1;
    }
    for (i = 0; i < fastcall_nb_regs; i++) {
      if (args_size <= 0) break;
      o(0x58 + fastcall_regs_ptr[i]); // pop r
      // TODO: incorrect for struct/floats
      args_size -= 4;
    }
  }
  gcall_or_jmp(0);
  if (args_size && func_call != FUNC_STDCALL) gadd_sp(args_size);
  vtop--;
}
Example #6
0
// Generate a floating point operation 'v = t1 op t2' instruction. The
// two operands are guaranted to have the same floating point type
// TODO: need to use ST1 too
void gen_opf(int op) {
  int a, ft, fc, swapped, r;

  // Convert constants to memory references
  if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
    vswap();
    gv(RC_FLOAT);
    vswap();
  }
  if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) gv(RC_FLOAT);

  // Must put at least one value in the floating point register
  if ((vtop[-1].r & VT_LVAL) && (vtop[0].r & VT_LVAL)) {
    vswap();
    gv(RC_FLOAT);
    vswap();
  }
  swapped = 0;
  // Swap the stack if needed so that t1 is the register and t2 is the memory reference
  if (vtop[-1].r & VT_LVAL) {
    vswap();
    swapped = 1;
  }
  if (op >= TOK_ULT && op <= TOK_GT) {
    // Load on stack second operand
    load(TREG_ST0, vtop);
    save_reg(TREG_EAX); // eax is used by FP comparison code
    if (op == TOK_GE || op == TOK_GT) {
      swapped = !swapped;
    } else if (op == TOK_EQ || op == TOK_NE) {
      swapped = 0;
    }
    if (swapped) o(0xc9d9); // fxch %st(1)
    o(0xe9da); // fucompp
    o(0xe0df); // fnstsw %ax
    if (op == TOK_EQ) {
      o(0x45e480); // and $0x45, %ah
      o(0x40fC80); // cmp $0x40, %ah
    } else if (op == TOK_NE) {
      o(0x45e480); // and $0x45, %ah
      o(0x40f480); // xor $0x40, %ah
      op = TOK_NE;
    } else if (op == TOK_GE || op == TOK_LE) {
      o(0x05c4f6); // test $0x05, %ah
      op = TOK_EQ;
    } else {
      o(0x45c4f6); // test $0x45, %ah
      op = TOK_EQ;
    }
    vtop--;
    vtop->r = VT_CMP;
    vtop->c.i = op;
  } else {
    // No memory reference possible for long double operations
    if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
      load(TREG_ST0, vtop);
      swapped = !swapped;
    }

    switch (op) {
      case '+':
        a = 0;
        break;
      case '-':
        a = 4;
        if (swapped) a++;
        break;
      case '*':
        a = 1;
        break;
      case '/':
        a = 6;
        if (swapped) a++;
        break;
      default:
        a = 0;
    }
    ft = vtop->type.t;
    fc = vtop->c.ul;
    if ((ft & VT_BTYPE) == VT_LDOUBLE) {
      o(0xde); // fxxxp %st, %st(1)
      o(0xc1 + (a << 3));
    } else {
      // If saved lvalue, then we must reload it
      r = vtop->r;
      if ((r & VT_VALMASK) == VT_LLOCAL) {
        SValue v1;
        r = get_reg(RC_INT);
        v1.type.t = VT_INT;
        v1.r = VT_LOCAL | VT_LVAL;
        v1.c.ul = fc;
        load(r, &v1);
        fc = 0;
      }

      if ((ft & VT_BTYPE) == VT_DOUBLE) {
        o(0xdc);
      } else {
        o(0xd8);
      }
      gen_modrm(a, r, vtop->sym, fc);
    }
    vtop--;
  }
}
Example #7
0
/* Generate an integer binary operation.

   The two operands are the top two entries of the value stack (vtop[-1] is
   the left operand, vtop[0] the right one). One entry is popped and the
   remaining top entry describes the result.

   op is either a character operator ('+', '-', '&', '^', '|', '*', '/', '%')
   or a TOK_xxx token. Any op not listed explicitly falls into the default
   case, which emits the ALU operation with opc 7; for ops in the range
   TOK_ULT..TOK_GT the result is then marked as a pending comparison (VT_CMP).
   NOTE(review): the opc values are used as the /digit field of the x86
   group-1 ALU opcodes (0=add, 1=or, 2=adc, 3=sbb, 4=and, 5=sub, 6=xor,
   7=cmp) - confirm against the instruction set reference. */
void gen_opi(int op)
{
    int r, fr, opc, c;

    switch(op) {
    case '+':
    case TOK_ADDC1: /* add with carry generation */
        opc = 0;
    /* shared emitter for all two-operand ALU operations; reached by goto
       from the cases below with opc already selected */
    gen_op8:
        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
            /* constant case: load only the left operand into a register
               (the swaps bring it to the top for gv() and back again) */
            vswap();
            r = gv(RC_INT);
            vswap();
            c = vtop->c.i;
            if (c == (char)c) {
                /* constant fits in a signed byte: short form with opcode
                   0x83 and an 8-bit immediate */
                /* XXX: generate inc and dec for smaller code ? */
                o(0x83);
                o(0xc0 | (opc << 3) | r);
                g(c);
            } else {
                /* long form: opcode 0x81 with a 32-bit immediate */
                o(0x81);
                oad(0xc0 | (opc << 3) | r, c);
            }
        } else {
            /* register/register form: both operands are loaded into
               integer registers, result goes to r (left operand) */
            gv2(RC_INT, RC_INT);
            r = vtop[-1].r;
            fr = vtop[0].r;
            o((opc << 3) | 0x01);
            o(0xc0 + r + fr * 8); 
        }
        vtop--;
        if (op >= TOK_ULT && op <= TOK_GT) {
            /* comparison: the result lives in the flags; remember which
               test to use */
            vtop->r = VT_CMP;
            vtop->c.i = op;
        }
        break;
    case '-':
    case TOK_SUBC1: /* sub with carry generation */
        opc = 5;
        goto gen_op8;
    case TOK_ADDC2: /* add with carry use */
        opc = 2;
        goto gen_op8;
    case TOK_SUBC2: /* sub with carry use */
        opc = 3;
        goto gen_op8;
    case '&':
        opc = 4;
        goto gen_op8;
    case '^':
        opc = 6;
        goto gen_op8;
    case '|':
        opc = 1;
        goto gen_op8;
    case '*':
        gv2(RC_INT, RC_INT);
        r = vtop[-1].r;
        fr = vtop[0].r;
        vtop--;
        o(0xaf0f); /* imul fr, r */
        o(0xc0 + fr + r * 8);
        break;
    case TOK_SHL:
        opc = 4;
        goto gen_shift;
    case TOK_SHR:
        opc = 5;
        goto gen_shift;
    case TOK_SAR:
        opc = 7;
    /* shared emitter for the three shift operations */
    gen_shift:
        /* pre-build the register-direct modrm byte; only the register
           number is OR-ed in below */
        opc = 0xc0 | (opc << 3);
        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
            /* constant case: shift count is masked to 5 bits and emitted
               as an immediate byte */
            vswap();
            r = gv(RC_INT);
            vswap();
            c = vtop->c.i & 0x1f;
            o(0xc1); /* shl/shr/sar $xxx, r */
            o(opc | r);
            g(c);
        } else {
            /* we generate the shift in ecx */
            gv2(RC_INT, RC_ECX);
            r = vtop[-1].r;
            o(0xd3); /* shl/shr/sar %cl, r */
            o(opc | r);
        }
        vtop--;
        break;
    case '/':
    case TOK_UDIV:
    case TOK_PDIV:
    case '%':
    case TOK_UMOD:
    case TOK_UMULL:
        /* first operand must be in eax */
        /* XXX: need better constraint for second operand */
        gv2(RC_EAX, RC_ECX);
        r = vtop[-1].r;
        fr = vtop[0].r;
        vtop--;
        /* edx is written by the instructions below, so spill whatever it
           holds */
        save_reg(TREG_EDX);
        if (op == TOK_UMULL) {
            o(0xf7); /* mul fr */
            o(0xe0 + fr);
            /* result is a register pair: eax in r, edx in r2 */
            vtop->r2 = TREG_EDX;
            r = TREG_EAX;
        } else {
            if (op == TOK_UDIV || op == TOK_UMOD) {
                o(0xf7d231); /* xor %edx, %edx, div fr, %eax */
                o(0xf0 + fr);
            } else {
                o(0xf799); /* cltd, idiv fr, %eax */
                o(0xf8 + fr);
            }
            /* remainder operators take their result from edx, the others
               from eax */
            if (op == '%' || op == TOK_UMOD)
                r = TREG_EDX;
            else
                r = TREG_EAX;
        }
        vtop->r = r;
        break;
    default:
        /* everything else (the comparison tokens) is emitted through
           gen_op8 with opc 7 */
        opc = 7;
        goto gen_op8;
    }
}
Example #8
0
/* Generate function call. The function address is pushed first, then
   all the parameters in call order. This functions pops all the
   parameters and the function address. */
void gfunc_call(int nb_args)
{
    int size, align, r, args_size, i, func_call;
    Sym *func_sym;
    
    args_size = 0;
    for(i = 0;i < nb_args; i++) {
        if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
            size = type_size(&vtop->type, &align);
            /* align to stack align size */
            size = (size + 3) & ~3;
            /* allocate the necessary size on stack */
            oad(0xec81, size); /* sub $xxx, %esp */
            /* generate structure store */
            r = get_reg(RC_INT);
            o(0x89); /* mov %esp, r */
            o(0xe0 + r);
            vset(&vtop->type, r | VT_LVAL, 0);
            vswap();
            vstore();
            args_size += size;
        } else if (is_float(vtop->type.t)) {
            gv(RC_FLOAT); /* only one float register */
            if ((vtop->type.t & VT_BTYPE) == VT_FLOAT)
                size = 4;
            else if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
                size = 8;
            else
                size = 12;
            oad(0xec81, size); /* sub $xxx, %esp */
            if (size == 12)
                o(0x7cdb);
            else
                o(0x5cd9 + size - 4); /* fstp[s|l] 0(%esp) */
            g(0x24);
            g(0x00);
            args_size += size;
        } else {
            /* simple type (currently always same size) */
            /* XXX: implicit cast ? */
            r = gv(RC_INT);
            if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
                size = 8;
                o(0x50 + vtop->r2); /* push r */
            } else {
                size = 4;
            }
            o(0x50 + r); /* push r */
            args_size += size;
        }
        vtop--;
    }
    save_regs(0); /* save used temporary registers */
    func_sym = vtop->type.ref;
    func_call = FUNC_CALL(func_sym->r);
    /* fast call case */
    if ((func_call >= FUNC_FASTCALL1 && func_call <= FUNC_FASTCALL3) ||
        func_call == FUNC_FASTCALLW) {
        int fastcall_nb_regs;
        uint8_t *fastcall_regs_ptr;
        if (func_call == FUNC_FASTCALLW) {
            fastcall_regs_ptr = fastcallw_regs;
            fastcall_nb_regs = 2;
        } else {
            fastcall_regs_ptr = fastcall_regs;
            fastcall_nb_regs = func_call - FUNC_FASTCALL1 + 1;
        }
        for(i = 0;i < fastcall_nb_regs; i++) {
            if (args_size <= 0)
                break;
            o(0x58 + fastcall_regs_ptr[i]); /* pop r */
            /* XXX: incorrect for struct/floats */
            args_size -= 4;
        }
    }
    gcall_or_jmp(0);
    if (args_size && func_call != FUNC_STDCALL)
        gadd_sp(args_size);
    vtop--;
}
/*!
   Calculate a 3x3 dilate filter, producing two output columns per inner
   iteration.

   Every destination element receives the maximum over the 3x3 source
   neighbourhood centred on the same position (full rectangular structuring
   element). The six source elements shared by two horizontally adjacent
   outputs are reduced once and reused for both.

   The source is read from row -1 to row bh and from column -1 to column bw
   relative to \p src, so the caller must provide readable padding around
   the block. Exactly bw columns are written per destination row, also when
   bw is odd.

   \param dst  - [Output] Destination block pointer
   \param dstr - [Input]  Destination block stride
   \param src  - [Input]  Source block pointer
   \param sstr - [Input]  Source block stride
   \param bw   - [Input]  Block width
   \param bh   - [Input]  Block height
 */
void
apu_flt_dilate_3x3(
            vec08u* dst, int dstr,
      const vec08u* src, int sstr,
            int bw, int bh
)
{
   // Structuring element: Rectangular - hardcoded
      //  1,  1,  1,
      //  1,  1,  1,
      //  1,  1,  1,

   // Neighbors: row above, current row, row below
   const vec08u* s0 = src - sstr;
   const vec08u* s1 = src;
   const vec08u* s2 = src + sstr;
   
   // Loop
   vec16s a_max, a0, a1, a2, a3, a4, a5, a6, a7, a8;
   vec16s b_max, b6, b7, b8;
   
   for (int y = 0; y < bh; ++y) chess_loop_range(1,) 
   {
      for (int x = 0; x < bw; x+=2) chess_loop_range(1,) 
      {
         // Copy src cells: columns x and x+1 are shared by both outputs,
         // column x-1 is only needed by output x
         a0 = s0[x];
         a1 = s0[x + 1];
         a2 = s1[x];
         a3 = s1[x + 1];
         a4 = s2[x];
         a5 = s2[x + 1];
         a6 = s0[x - 1];
         a7 = s1[x - 1];
         a8 = s2[x - 1];
            
         // Compare and find max of the six shared cells
         // NOTE(review): vswap(a, b, cond) is defined elsewhere; presumably
         // it exchanges a and b where cond holds, leaving the larger value
         // in the second argument - confirm.
         a_max = a0;

         vswap(a1, a_max, a1 > a_max);
         vswap(a2, a_max, a2 > a_max);
         vswap(a3, a_max, a3 > a_max);
         vswap(a4, a_max, a4 > a_max);
         vswap(a5, a_max, a5 > a_max);
         
         //save max of common pixels
         b_max = a_max;
         
         // Output x: add the left-hand column
         vswap(a6, a_max, a6 > a_max);
         vswap(a7, a_max, a7 > a_max);
         vswap(a8, a_max, a8 > a_max);
        
         // Assign to output
         dst[x]      = (vec08u)a_max;

         // Output x+1 exists only while it is still inside the block. For
         // an odd bw the last iteration has no second column, and storing
         // it would write one element past the block width.
         if (x + 1 < bw)
         {
            // Add the right-hand column
            b6 = s0[x + 2];
            b7 = s1[x + 2];
            b8 = s2[x + 2];

            vswap(b6, b_max, b6 > b_max);
            vswap(b7, b_max, b7 > b_max);
            vswap(b8, b_max, b8 > b_max);

            dst[x + 1]  = (vec08u)b_max;
         }
      }
      
      // Proceed to next line: slide the three row pointers down by one row
      s0   = s1;
      s1   = s2; 
      s2  += sstr;
      dst += dstr;
   }
}