/*! Calculate a 3x3 dilate filter: each destination pixel becomes the maximum
    of the 3x3 source neighborhood centered on it (rectangular structuring
    element, all ones).
    \param dst  - [Output] Destination block pointer
    \param dstr - [Input] Destination block stride
    \param src  - [Input] Source block pointer
    \param sstr - [Input] Source block stride
    \param bw   - [Input] Block width
    \param bh   - [Input] Block height
    NOTE(review): reads one column to the left/right of the block and one row
    above/below it (src-1, (y-1)*sstr, (y+1)*sstr), so the caller must supply
    a 1-pixel border around the block -- TODO confirm. */
void apu_flt_dilate_3x3( vec08u* dst, int dstr, const vec08u* src, int sstr, int bw, int bh )
{
   // Structuring element: Rectangular - hardcoded
   // 1, 1, 1,
   // 1, 1, 1,
   // 1, 1, 1,
   for (int y = 0; y < bh; ++y) chess_loop_range(1,)
   {
      // Row pointers for the line above, the current line and the line below.
      // They start one column to the LEFT of the block so the first three
      // column maxima cover columns x-1, x, x+1 of output pixel 0.
      const vec08u* ps0 = (src-1) + (y-1)*sstr;
      const vec08u* ps1 = (src-1) + (y)*sstr;
      const vec08u* ps2 = (src-1) + (y+1)*sstr;
      // chess_storage(Vn) pins each value to a specific vector register --
      // presumably to steer the chess scheduler/allocator; TODO confirm.
      vec16s chess_storage(V0) s0 = *ps0++;
      vec16s chess_storage(V1) s1 = *ps1++;
      vec16s chess_storage(V2) s2 = *ps2++;
      // amax = vertical max of the previous column (x-1 for the first pixel).
      // vswap conditionally swaps, leaving the larger value in amax; the next
      // column's loads are interleaved with the swaps (software pipelining).
      vec16s chess_storage(V4) amax = s0; s0 = *ps0++;
      vswap(s1, amax, s1 > amax); s1 = *ps1++;
      vswap(s2, amax, s2 > amax); s2 = *ps2++;
      // bmax = vertical max of the current column (x for the first pixel).
      vec16s chess_storage(V5) bmax = s0; s0 = *ps0++;
      vswap(s1, bmax, s1 > bmax); s1 = *ps1++;
      vswap(s2, bmax, s2 > bmax); s2 = *ps2++;
      for (int x = 0; x < bw; ++x) chess_loop_range(1,)
      {
         // cmax = vertical max of the next column (x+1).
         vec16s chess_storage(V6) cmax = s0; s0 = *ps0++;
         vswap(s1, cmax, s1 > cmax); s1 = *ps1++;
         vswap(s2, cmax, s2 > cmax); s2 = *ps2++;
         // Horizontal max of the three column maxima = full 3x3 maximum.
         vec16s o = amax;
         vec16s b = bmax;
         vswap(o, b, o < b);
         vec16s c = cmax;
         vswap(o, c, o < c);
         // Slide the column-max window one column to the right for the next x.
         amax = bmax;
         bmax = cmax;
         // Narrow back to 8 bits and store.
         dst[x] = (vec08u)o;
      }
      // Proceed to next line
      dst += dstr;
   }
}
/* Three-way (Bentley-McIlroy style) radix quicksort of the position array
   posn[lo..hi], keyed on the byte data[posn[i]+depth].  Runs of equal bytes
   recurse on depth+1 via an explicit stack; rank[] receives the final index
   of every element whose order becomes fully determined.
   NOTE(review): GTD() and pivot3d() are defined elsewhere -- assumed to be
   the suffix comparator at `depth` and a pivot-byte selector; confirm. */
void _BSort::quicksort3d(int lo, int hi, int depth)
{
  /* Initialize stack: explicit (lo, hi, depth) jobs replace recursion. */
  int slo[QUICKSORT_STACK];
  int shi[QUICKSORT_STACK];
  int sd[QUICKSORT_STACK];
  int sp = 1;
  slo[0] = lo;
  shi[0] = hi;
  sd[0] = depth;
  // Recursion elimination loop
  while (--sp>=0)
    {
      lo = slo[sp];
      hi = shi[sp];
      depth = sd[sp];
      // Depth cutoff: leave the segment unsorted and give every element the
      // segment's upper bound as rank (ties resolved by later passes --
      // presumably quicksort3r; confirm).
      if (depth >= PRESORT_DEPTH)
        {
          for (int i=lo; i<=hi; i++)
            rank[posn[i]] = hi;
        }
      // Small segment: insertion sort with the GTD comparator.
      else if (hi-lo<PRESORT_THRESH)
        {
          int i,j;
          for (i=lo+1; i<=hi; i++)
            {
              int tmp = posn[i];
              for(j=i-1; j>=lo && GTD(posn[j], tmp, depth); j--)
                posn[j+1] = posn[j];
              posn[j+1] = tmp;
            }
          // Assign ranks walking backwards: each run of GTD-equal entries
          // shares the highest index i of the run.
          for(i=hi;i>=lo;i=j)
            {
              int tmp = posn[i];
              rank[tmp] = i;
              for (j=i-1; j>=lo && !GTD(tmp,posn[j],depth); j--)
                rank[posn[j]] = i;
            }
        }
      // Large segment: three-way partition on the byte at `depth`.
      else
        {
          int tmp;
          unsigned char *dd=data+depth;
          unsigned char med = pivot3d(dd,lo,hi);   // pivot byte value
          // -- positions are organized as follows:
          //   [lo..l1[ [l1..l[ ]h..h1] ]h1..hi]
          //      =        <       >      =
          int l1 = lo;
          int h1 = hi;
          // Skip elements already equal to the pivot at both ends.
          while (dd[posn[l1]]==med && l1<h1) { l1++; }
          while (dd[posn[h1]]==med && l1<h1) { h1--; }
          int l = l1;
          int h = h1;
          // -- partition set
          for (;;)
            {
              // Advance l over '<' elements, parking '=' at the left edge.
              while (l<=h)
                {
                  int c = (int)dd[posn[l]] - (int)med;
                  if (c > 0) break;
                  if (c == 0) { tmp=posn[l]; posn[l]=posn[l1]; posn[l1++]=tmp; }
                  l++;
                }
              // Retreat h over '>' elements, parking '=' at the right edge.
              while (l<=h)
                {
                  int c = (int)dd[posn[h]] - (int)med;
                  if (c < 0) break;
                  if (c == 0) { tmp=posn[h]; posn[h]=posn[h1]; posn[h1--]=tmp; }
                  h--;
                }
              if (l>h) break;
              tmp=posn[l]; posn[l]=posn[h]; posn[h]=tmp;
            }
          // -- reorganize as follows
          //   [lo..l1[ [l1..h1] ]h1..hi]
          //      <        =        >
          // Swap the parked '=' blocks into the middle (the 4-arg vswap
          // presumably exchanges `tmp` entries between two offsets; confirm).
          tmp = mini(l1-lo, l-l1);
          vswap(lo, l-tmp, tmp, posn);
          l1 = lo + (l-l1);
          tmp = mini(hi-h1, h1-h);
          vswap(hi-tmp+1, h+1, tmp, posn);
          h1 = hi - (h1-h);
          // -- process segments
          ASSERT(sp+3<QUICKSORT_STACK);
          // ----- middle segment (=?) [l1, h1]
          l = l1; h = h1;
          if (med==0) // special case for marker [slow]
            // NOTE(review): posn[i]+depth == size-1 presumably identifies the
            // end-of-data marker, whose rank can be fixed immediately.
            for (int i=l; i<=h; i++)
              if ((int)posn[i]+depth == size-1)
                {
                  tmp=posn[i]; posn[i]=posn[l]; posn[l]=tmp;
                  rank[tmp]=l++;
                  break;
                }
          // Equal bytes: re-sort the middle segment on the next byte.
          if (l<h) { slo[sp] = l; shi[sp] = h; sd[sp++] = depth+1; }
          else if (l==h) { rank[posn[h]] = h; }   // singleton: rank is final
          // ----- lower segment (<) [lo, l1[
          l = lo; h = l1-1;
          if (l<h) { slo[sp] = l; shi[sp] = h; sd[sp++] = depth; }
          else if (l==h) { rank[posn[h]] = h; }
          // ----- upper segment (>) ]h1, hi]
          l = h1+1; h = hi;
          if (l<h) { slo[sp] = l; shi[sp] = h; sd[sp++] = depth; }
          else if (l==h) { rank[posn[h]] = h; }
        }
    }
}
/* Iterative three-way quicksort of posn[lo..hi] keyed on the integer
   rank[posn[i]+depth] (the doubling pass of the suffix sort).  Elements equal
   under the current key are NOT refined here: they all receive a common
   upper-bound rank (h1) and are refined by later passes.  Small segments are
   delegated to ranksort().
   NOTE(review): pivot3r() is defined elsewhere; assumed to return a pivot key
   value -- confirm. */
void _BSort::quicksort3r(int lo, int hi, int depth)
{
  /* Initialize stack: explicit (lo, hi) jobs replace recursion; depth is
     constant across the whole call, so it is not stacked. */
  int slo[QUICKSORT_STACK];
  int shi[QUICKSORT_STACK];
  int sp = 1;
  slo[0] = lo;
  shi[0] = hi;
  // Recursion elimination loop
  while (--sp>=0)
    {
      lo = slo[sp];
      hi = shi[sp];
      // Small segment: hand off to the dedicated rank-based sorter.
      if (hi-lo<RANKSORT_THRESH)
        {
          ranksort(lo, hi, depth);
        }
      else
        {
          int tmp;
          int *rr=rank+depth;
          int med = pivot3r(rr,lo,hi);   // pivot key value
          // -- positions are organized as follows:
          //   [lo..l1[ [l1..l[ ]h..h1] ]h1..hi]
          //      =        <       >      =
          int l1 = lo;
          int h1 = hi;
          // Skip elements already equal to the pivot at both ends.
          while (rr[posn[l1]]==med && l1<h1) { l1++; }
          while (rr[posn[h1]]==med && l1<h1) { h1--; }
          int l = l1;
          int h = h1;
          // -- partition set
          for (;;)
            {
              // Advance l over '<' elements, parking '=' at the left edge.
              while (l<=h)
                {
                  int c = rr[posn[l]] - med;
                  if (c > 0) break;
                  if (c == 0) { tmp=posn[l]; posn[l]=posn[l1]; posn[l1++]=tmp; }
                  l++;
                }
              // Retreat h over '>' elements, parking '=' at the right edge.
              while (l<=h)
                {
                  int c = rr[posn[h]] - med;
                  if (c < 0) break;
                  if (c == 0) { tmp=posn[h]; posn[h]=posn[h1]; posn[h1--]=tmp; }
                  h--;
                }
              if (l>h) break;
              tmp=posn[l]; posn[l]=posn[h]; posn[h]=tmp;
            }
          // -- reorganize as follows
          //   [lo..l1[ [l1..h1] ]h1..hi]
          //      <        =        >
          tmp = mini(l1-lo, l-l1);
          vswap(lo, l-tmp, tmp, posn);
          l1 = lo + (l-l1);
          tmp = mini(hi-h1, h1-h);
          vswap(hi-tmp+1, h+1, tmp, posn);
          h1 = hi - (h1-h);
          // -- process segments
          ASSERT(sp+2<QUICKSORT_STACK);
          // ----- middle segment (=?) [l1, h1]
          // Equal keys: all share the upper-bound rank; nothing is pushed.
          for(int i=l1;i<=h1;i++)
            rank[posn[i]] = h1;
          // ----- lower segment (<) [lo, l1[
          if (l1 > lo)
            {
              // Provisional rank for the whole segment, then push it.
              for(int i=lo;i<l1;i++)
                rank[posn[i]]=l1-1;
              slo[sp]=lo;
              shi[sp]=l1-1;
              if (slo[sp] < shi[sp])
                sp++;   // keep the job only if it has at least two elements
            }
          // ----- upper segment (>) ]h1, hi]
          if (h1 < hi)
            {
              slo[sp]=h1+1;
              shi[sp]=hi;
              if (slo[sp] < shi[sp])
                sp++;
            }
        }
    }
}
// Generate an integer binary operation between the two top values of the
// value stack (vtop[-1] op vtop[0]); pops one value and leaves the result on
// the stack.  i386 machine-code bytes are emitted through o()/g()/oad().
// Comparison tokens (TOK_ULT..TOK_GT) reach gen_op8 through the default
// label (opc 7 = cmp) and leave their result in the CPU flags (VT_CMP).
void gen_opi(int op)
{
    int r, fr, opc, c;

    switch (op) {
    case '+':
    case TOK_ADDC1: // Add with carry generation
        opc = 0;
    gen_op8:
        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
            // Constant case: load the non-constant operand into a register,
            // then fold the immediate into the instruction.
            vswap();
            r = gv(RC_INT);
            vswap();
            c = vtop->c.i;
            if (c == (char) c) {
                // Optimize +/- 1 case with inc and dec.  Only the plain
                // '+'/'-' forms qualify: inc/dec leave the carry flag
                // untouched, so the carry-generating/using variants
                // (TOK_ADDC*/TOK_SUBC*) must keep add/sub.
                if (op == '+' && c == 1 || op == '-' && c == -1) {
                    o(0x40 | r); // inc r
                } else if (op == '-' && c == 1 || op == '+' && c == -1) {
                    o(0x48 | r); // dec r
                } else {
                    o(0x83); // op $imm8, r (sign-extended immediate)
                    o(0xc0 | (opc << 3) | r);
                    g(c);
                }
            } else {
                o(0x81); // op $imm32, r
                oad(0xc0 | (opc << 3) | r, c);
            }
        } else {
            // Register-register form: op fr, r
            gv2(RC_INT, RC_INT);
            r = vtop[-1].r;
            fr = vtop[0].r;
            o((opc << 3) | 0x01);
            o(0xc0 + r + fr * 8);
        }
        vtop--;
        if (op >= TOK_ULT && op <= TOK_GT) {
            // Comparison: result lives in the flags, record which condition.
            vtop->r = VT_CMP;
            vtop->c.i = op;
        }
        break;
    case '-':
    case TOK_SUBC1: // Subtract with carry generation
        opc = 5;
        goto gen_op8;
    case TOK_ADDC2: // Add with carry use
        opc = 2;
        goto gen_op8;
    case TOK_SUBC2: // Subtract with carry use
        opc = 3;
        goto gen_op8;
    case '&':
        opc = 4;
        goto gen_op8;
    case '^':
        opc = 6;
        goto gen_op8;
    case '|':
        opc = 1;
        goto gen_op8;
    case '*':
        gv2(RC_INT, RC_INT);
        r = vtop[-1].r;
        fr = vtop[0].r;
        vtop--;
        o(0xaf0f); // imul fr, r
        o(0xc0 + fr + r * 8);
        break;
    case TOK_SHL:
        opc = 4;
        goto gen_shift;
    case TOK_SHR:
        opc = 5;
        goto gen_shift;
    case TOK_SAR:
        opc = 7;
    gen_shift:
        opc = 0xc0 | (opc << 3);
        if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) {
            // Constant case: shift count folded into the instruction
            // (masked to 0..31, matching the hardware behavior).
            vswap();
            r = gv(RC_INT);
            vswap();
            c = vtop->c.i & 0x1f;
            o(0xc1); // shl/shr/sar $xxx, r
            o(opc | r);
            g(c);
        } else {
            // Generate the shift in ecx (the only register cl shifts allow)
            gv2(RC_INT, RC_ECX);
            r = vtop[-1].r;
            o(0xd3); // shl/shr/sar %cl, r
            o(opc | r);
        }
        vtop--;
        break;
    case '/':
    case TOK_UDIV:
    case TOK_PDIV:
    case '%':
    case TOK_UMOD:
    case TOK_UMULL:
        // First operand must be in eax (x86 mul/div requirement)
        // TODO: need better constraint for second operand
        gv2(RC_EAX, RC_ECX);
        r = vtop[-1].r;
        fr = vtop[0].r;
        vtop--;
        save_reg(TREG_EDX); // mul/div clobber edx
        if (op == TOK_UMULL) {
            // Widening unsigned multiply: 64-bit result in edx:eax.
            o(0xf7); // mul fr
            o(0xe0 + fr);
            vtop->r2 = TREG_EDX; // high half
            r = TREG_EAX;
        } else {
            if (op == TOK_UDIV || op == TOK_UMOD) {
                o(0xf7d231); // xor %edx, %edx, div fr, %eax
                o(0xf0 + fr);
            } else {
                o(0xf799); // cltd, idiv fr, %eax
                o(0xf8 + fr);
            }
            // Quotient in eax, remainder in edx.
            if (op == '%' || op == TOK_UMOD) {
                r = TREG_EDX;
            } else {
                r = TREG_EAX;
            }
        }
        vtop->r = r;
        break;
    default:
        // Comparisons and anything else: cmp (opc 7).
        opc = 7;
        goto gen_op8;
    }
}
// Generate function call. The function address is pushed first, then
// all the parameters in call order. This function pops all the
// parameters and the function address.
// Each loop iteration handles the current stack top (vtop) and pops it, so
// arguments are emitted from the last pushed to the first.
void gfunc_call(int nb_args)
{
    int size, align, r, args_size, i, func_call, v;
    Sym *func_sym;

    args_size = 0; // total bytes pushed, for caller-side cleanup
    for (i = 0; i < nb_args; i++) {
        if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
            size = type_size(&vtop->type, &align);
            // Align to stack align size
            size = (size + 3) & ~3;
            // Allocate the necessary size on stack
            oad(0xec81, size); // sub $xxx, %esp
            // Generate structure store: copy the struct into the slot just
            // reserved, addressed through a scratch register.
            r = get_reg(RC_INT);
            o(0x89); // mov %esp, r
            o(0xe0 + r);
            vset(&vtop->type, r | VT_LVAL, 0);
            vswap();
            vstore();
            args_size += size;
        } else if (is_float(vtop->type.t)) {
            gv(RC_FLOAT); // Only one float register
            if ((vtop->type.t & VT_BTYPE) == VT_FLOAT) {
                size = 4;
            } else if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE) {
                size = 8;
            } else {
                size = 12; // long double
            }
            oad(0xec81, size); // sub $xxx, %esp
            if (size == 12) {
                o(0x7cdb); // fstpt 0(%esp) -- long double store
            } else {
                o(0x5cd9 + size - 4); // fstp[s|l] 0(%esp)
            }
            g(0x24); // SIB byte for (%esp)
            g(0x00); // displacement 0
            args_size += size;
        } else {
            // Simple type (currently always same size)
            // TODO: implicit cast?
            v = vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM);
            if (v == VT_CONST || v == (VT_CONST | VT_SYM)) {
                // Push constant directly, without going through a register.
                if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
                    size = 8;
                    // High 32 bits first; the low 32 bits are pushed by the
                    // common code below.
                    if (vtop->c.word[1] == (char) vtop->c.word[1]) {
                        g(0x6a); // push imm8
                        g(vtop->c.word[1]);
                    } else {
                        g(0x68); // push imm32
                        gen_le32(vtop->c.word[1]);
                    }
                } else {
                    size = 4;
                }
                // imm8 form only for plain constants that fit in a signed
                // byte; symbolic addresses always need the imm32 relocation.
                if ((v & VT_SYM) == 0 && vtop->c.i == (char) vtop->c.i) {
                    g(0x6a); // push imm8
                    g(vtop->c.i);
                } else {
                    g(0x68); // push imm32
                    gen_addr32(v, vtop->sym, vtop->c.i);
                }
            } else {
                r = gv(RC_INT);
                if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
                    size = 8;
                    o(0x50 + vtop->r2); // push r2 (second half of long long)
                } else {
                    size = 4;
                }
                o(0x50 + r); // push r
            }
            args_size += size;
        }
        vtop--;
    }
    save_regs(0); // Save used temporary registers
    func_sym = vtop->type.ref;
    func_call = FUNC_CALL(func_sym->r);
    // fast call case: move the leading register-passed arguments back off
    // the stack into the calling-convention registers.
    if ((func_call >= FUNC_FASTCALL1 && func_call <= FUNC_FASTCALL3) ||
        func_call == FUNC_FASTCALLW) {
        int fastcall_nb_regs;
        uint8_t *fastcall_regs_ptr;
        if (func_call == FUNC_FASTCALLW) {
            fastcall_regs_ptr = fastcallw_regs;
            fastcall_nb_regs = 2;
        } else {
            fastcall_regs_ptr = fastcall_regs;
            fastcall_nb_regs = func_call - FUNC_FASTCALL1 + 1;
        }
        for (i = 0; i < fastcall_nb_regs; i++) {
            if (args_size <= 0)
                break;
            o(0x58 + fastcall_regs_ptr[i]); // pop r
            // TODO: incorrect for struct/floats
            args_size -= 4;
        }
    }
    gcall_or_jmp(0);
    // Callee cleans the stack under stdcall; otherwise the caller does.
    if (args_size && func_call != FUNC_STDCALL)
        gadd_sp(args_size);
    vtop--;
}
// Generate a floating point operation 'v = t1 op t2' instruction. The
// two operands are guaranteed to have the same floating point type.
// Arithmetic uses one x87 register and one memory operand; comparisons go
// through fucompp + fnstsw and leave the result in the CPU flags (VT_CMP).
// TODO: need to use ST1 too
void gen_opf(int op)
{
    int a, ft, fc, swapped, r;

    // Convert constants to memory references
    if ((vtop[-1].r & (VT_VALMASK | VT_LVAL)) == VT_CONST) {
        vswap();
        gv(RC_FLOAT);
        vswap();
    }
    if ((vtop[0].r & (VT_VALMASK | VT_LVAL)) == VT_CONST)
        gv(RC_FLOAT);

    // Must put at least one value in the floating point register
    if ((vtop[-1].r & VT_LVAL) && (vtop[0].r & VT_LVAL)) {
        vswap();
        gv(RC_FLOAT);
        vswap();
    }
    swapped = 0;
    // Swap the stack if needed so that t1 is the register and t2 is the
    // memory reference; remember the swap to fix up non-commutative ops.
    if (vtop[-1].r & VT_LVAL) {
        vswap();
        swapped = 1;
    }
    if (op >= TOK_ULT && op <= TOK_GT) {
        // Comparison: load the second operand onto the x87 stack too.
        load(TREG_ST0, vtop);
        save_reg(TREG_EAX); // eax is used by FP comparison code
        // GE/GT are handled as their mirrored counterparts; EQ/NE are
        // symmetric, so any pending swap can be dropped.
        if (op == TOK_GE || op == TOK_GT) {
            swapped = !swapped;
        } else if (op == TOK_EQ || op == TOK_NE) {
            swapped = 0;
        }
        if (swapped)
            o(0xc9d9); // fxch %st(1)
        o(0xe9da); // fucompp -- compare st(0) with st(1), pop both
        o(0xe0df); // fnstsw %ax -- FPU status word (C0/C2/C3) into ax
        // The 0x45 mask selects the C0/C2/C3 condition bits in %ah.
        if (op == TOK_EQ) {
            o(0x45e480); // and $0x45, %ah
            o(0x40fC80); // cmp $0x40, %ah
        } else if (op == TOK_NE) {
            o(0x45e480); // and $0x45, %ah
            o(0x40f480); // xor $0x40, %ah
            op = TOK_NE; // redundant: op is already TOK_NE
        } else if (op == TOK_GE || op == TOK_LE) {
            o(0x05c4f6); // test $0x05, %ah
            op = TOK_EQ;
        } else {
            o(0x45c4f6); // test $0x45, %ah
            op = TOK_EQ;
        }
        vtop--;
        vtop->r = VT_CMP; // result lives in the flags
        vtop->c.i = op;
    } else {
        // No memory reference possible for long double operations
        if ((vtop->type.t & VT_BTYPE) == VT_LDOUBLE) {
            load(TREG_ST0, vtop);
            swapped = !swapped;
        }
        // a = x87 opcode-extension for the operation; '-' and '/' have
        // reversed forms (a+1) to undo a pending operand swap.
        switch (op) {
        case '+':
            a = 0;
            break;
        case '-':
            a = 4;
            if (swapped)
                a++;
            break;
        case '*':
            a = 1;
            break;
        case '/':
            a = 6;
            if (swapped)
                a++;
            break;
        default:
            a = 0;
        }
        ft = vtop->type.t;
        fc = vtop->c.ul;
        if ((ft & VT_BTYPE) == VT_LDOUBLE) {
            // Both operands are on the x87 stack.
            o(0xde); // fxxxp %st, %st(1)
            o(0xc1 + (a << 3));
        } else {
            // If saved lvalue, then we must reload it (the address itself
            // was spilled to a local slot).
            r = vtop->r;
            if ((r & VT_VALMASK) == VT_LLOCAL) {
                SValue v1;
                r = get_reg(RC_INT);
                v1.type.t = VT_INT;
                v1.r = VT_LOCAL | VT_LVAL;
                v1.c.ul = fc;
                load(r, &v1);
                fc = 0;
            }
            // Opcode selects the operand width: 0xdc = double, 0xd8 = float.
            if ((ft & VT_BTYPE) == VT_DOUBLE) {
                o(0xdc);
            } else {
                o(0xd8);
            }
            gen_modrm(a, r, vtop->sym, fc);
        }
        vtop--;
    }
}
/* generate an integer binary operation */ void gen_opi(int op) { int r, fr, opc, c; switch(op) { case '+': case TOK_ADDC1: /* add with carry generation */ opc = 0; gen_op8: if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) { /* constant case */ vswap(); r = gv(RC_INT); vswap(); c = vtop->c.i; if (c == (char)c) { /* XXX: generate inc and dec for smaller code ? */ o(0x83); o(0xc0 | (opc << 3) | r); g(c); } else { o(0x81); oad(0xc0 | (opc << 3) | r, c); } } else { gv2(RC_INT, RC_INT); r = vtop[-1].r; fr = vtop[0].r; o((opc << 3) | 0x01); o(0xc0 + r + fr * 8); } vtop--; if (op >= TOK_ULT && op <= TOK_GT) { vtop->r = VT_CMP; vtop->c.i = op; } break; case '-': case TOK_SUBC1: /* sub with carry generation */ opc = 5; goto gen_op8; case TOK_ADDC2: /* add with carry use */ opc = 2; goto gen_op8; case TOK_SUBC2: /* sub with carry use */ opc = 3; goto gen_op8; case '&': opc = 4; goto gen_op8; case '^': opc = 6; goto gen_op8; case '|': opc = 1; goto gen_op8; case '*': gv2(RC_INT, RC_INT); r = vtop[-1].r; fr = vtop[0].r; vtop--; o(0xaf0f); /* imul fr, r */ o(0xc0 + fr + r * 8); break; case TOK_SHL: opc = 4; goto gen_shift; case TOK_SHR: opc = 5; goto gen_shift; case TOK_SAR: opc = 7; gen_shift: opc = 0xc0 | (opc << 3); if ((vtop->r & (VT_VALMASK | VT_LVAL | VT_SYM)) == VT_CONST) { /* constant case */ vswap(); r = gv(RC_INT); vswap(); c = vtop->c.i & 0x1f; o(0xc1); /* shl/shr/sar $xxx, r */ o(opc | r); g(c); } else { /* we generate the shift in ecx */ gv2(RC_INT, RC_ECX); r = vtop[-1].r; o(0xd3); /* shl/shr/sar %cl, r */ o(opc | r); } vtop--; break; case '/': case TOK_UDIV: case TOK_PDIV: case '%': case TOK_UMOD: case TOK_UMULL: /* first operand must be in eax */ /* XXX: need better constraint for second operand */ gv2(RC_EAX, RC_ECX); r = vtop[-1].r; fr = vtop[0].r; vtop--; save_reg(TREG_EDX); if (op == TOK_UMULL) { o(0xf7); /* mul fr */ o(0xe0 + fr); vtop->r2 = TREG_EDX; r = TREG_EAX; } else { if (op == TOK_UDIV || op == TOK_UMOD) { o(0xf7d231); /* xor %edx, %edx, 
div fr, %eax */ o(0xf0 + fr); } else { o(0xf799); /* cltd, idiv fr, %eax */ o(0xf8 + fr); } if (op == '%' || op == TOK_UMOD) r = TREG_EDX; else r = TREG_EAX; } vtop->r = r; break; default: opc = 7; goto gen_op8; } }
/* Generate function call. The function address is pushed first, then all the
   parameters in call order. This function pops all the parameters and the
   function address.  Each loop iteration handles the current stack top
   (vtop) and pops it, so arguments are emitted from the last pushed to the
   first. */
void gfunc_call(int nb_args)
{
    int size, align, r, args_size, i, func_call;
    Sym *func_sym;

    args_size = 0; /* total bytes pushed, for caller-side cleanup */
    for(i = 0;i < nb_args; i++) {
        if ((vtop->type.t & VT_BTYPE) == VT_STRUCT) {
            size = type_size(&vtop->type, &align);
            /* align to stack align size */
            size = (size + 3) & ~3;
            /* allocate the necessary size on stack */
            oad(0xec81, size); /* sub $xxx, %esp */
            /* generate structure store: copy the struct into the slot just
               reserved, addressed through a scratch register */
            r = get_reg(RC_INT);
            o(0x89); /* mov %esp, r */
            o(0xe0 + r);
            vset(&vtop->type, r | VT_LVAL, 0);
            vswap();
            vstore();
            args_size += size;
        } else if (is_float(vtop->type.t)) {
            gv(RC_FLOAT); /* only one float register */
            if ((vtop->type.t & VT_BTYPE) == VT_FLOAT)
                size = 4;
            else if ((vtop->type.t & VT_BTYPE) == VT_DOUBLE)
                size = 8;
            else
                size = 12; /* long double */
            oad(0xec81, size); /* sub $xxx, %esp */
            if (size == 12)
                o(0x7cdb); /* fstpt 0(%esp) -- long double store */
            else
                o(0x5cd9 + size - 4); /* fstp[s|l] 0(%esp) */
            g(0x24); /* SIB byte for (%esp) */
            g(0x00); /* displacement 0 */
            args_size += size;
        } else {
            /* simple type (currently always same size) */
            /* XXX: implicit cast ? */
            r = gv(RC_INT);
            if ((vtop->type.t & VT_BTYPE) == VT_LLONG) {
                size = 8;
                o(0x50 + vtop->r2); /* push r2 (second half of long long) */
            } else {
                size = 4;
            }
            o(0x50 + r); /* push r */
            args_size += size;
        }
        vtop--;
    }
    save_regs(0); /* save used temporary registers */
    func_sym = vtop->type.ref;
    func_call = FUNC_CALL(func_sym->r);
    /* fast call case: move the leading register-passed arguments back off
       the stack into the calling-convention registers */
    if ((func_call >= FUNC_FASTCALL1 && func_call <= FUNC_FASTCALL3) ||
        func_call == FUNC_FASTCALLW) {
        int fastcall_nb_regs;
        uint8_t *fastcall_regs_ptr;
        if (func_call == FUNC_FASTCALLW) {
            fastcall_regs_ptr = fastcallw_regs;
            fastcall_nb_regs = 2;
        } else {
            fastcall_regs_ptr = fastcall_regs;
            fastcall_nb_regs = func_call - FUNC_FASTCALL1 + 1;
        }
        for(i = 0;i < fastcall_nb_regs; i++) {
            if (args_size <= 0)
                break;
            o(0x58 + fastcall_regs_ptr[i]); /* pop r */
            /* XXX: incorrect for struct/floats */
            args_size -= 4;
        }
    }
    gcall_or_jmp(0);
    /* callee cleans the stack under stdcall; otherwise the caller does */
    if (args_size && func_call != FUNC_STDCALL)
        gadd_sp(args_size);
    vtop--;
}
/*!
 * 3x3 grayscale dilation: every destination pixel becomes the maximum of
 * the 3x3 source neighborhood centered on it (rectangular structuring
 * element of all ones).
 * \param dst  - [Output] Destination block pointer
 * \param dstr - [Input] Destination block stride
 * \param src  - [Input] Source block pointer
 * \param sstr - [Input] Source block stride
 * \param bw   - [Input] Block width
 * \param bh   - [Input] Block height
 */
void apu_flt_dilate_3x3( vec08u* dst, int dstr, const vec08u* src, int sstr, int bw, int bh )
{
   // Row pointers: the line above, the current line, and the line below.
   const vec08u* row_up  = src - sstr;
   const vec08u* row_mid = src;
   const vec08u* row_dn  = src + sstr;

   for (int y = 0; y < bh; ++y) chess_loop_range(1,)
   {
      // Two outputs per iteration: pixels x and x+1 share the six samples
      // in columns x and x+1 of their 3x3 windows, so that maximum is
      // computed once and then extended with column x-1 (left pixel) or
      // column x+2 (right pixel).
      for (int x = 0; x < bw; x += 2) chess_loop_range(1,)
      {
         vec16s t;

         // Maximum over the shared 3x2 core (columns x and x+1).
         // Each vswap conditionally exchanges, leaving the larger value in
         // the accumulator.
         vec16s core = row_up[x];
         t = row_up [x + 1]; vswap(t, core, t > core);
         t = row_mid[x    ]; vswap(t, core, t > core);
         t = row_mid[x + 1]; vswap(t, core, t > core);
         t = row_dn [x    ]; vswap(t, core, t > core);
         t = row_dn [x + 1]; vswap(t, core, t > core);

         // Left pixel: extend the core with column x-1.
         vec16s left = core;
         t = row_up [x - 1]; vswap(t, left, t > left);
         t = row_mid[x - 1]; vswap(t, left, t > left);
         t = row_dn [x - 1]; vswap(t, left, t > left);

         // Right pixel: extend the core with column x+2.
         vec16s right = core;
         t = row_up [x + 2]; vswap(t, right, t > right);
         t = row_mid[x + 2]; vswap(t, right, t > right);
         t = row_dn [x + 2]; vswap(t, right, t > right);

         // Narrow back to 8 bits and store both pixels.
         dst[x]     = (vec08u)left;
         dst[x + 1] = (vec08u)right;
      }
      // Advance one line.
      row_up  = row_mid;
      row_mid = row_dn;
      row_dn += sstr;
      dst    += dstr;
   }
}