/*!
   Calculate integral images.

   Rows are processed top to bottom. For each row the function
     1. sums the bw source pixels of the row into a 16-bit vector,
     2. combines that sum with (ncu - 1) successively vmsr()-shifted
        copies of itself into a 32-bit running total,
     3. walks the row from the last column to the first, storing
        "running total + tlst[x]" to both dst[x] and tlst[x] and then
        removing src[x] from the running total.
   Because tlst is overwritten with every row's result, it carries the
   previous row's output into the next row; for the first row it holds
   whatever the caller placed there.

   \param dst  - [Output] Destination block pointer
   \param dstr - [Input]  Destination block stride
   \param src  - [Input]  Source block pointer
   \param tlst - [Input/Output] Temporary line pointer (read, then updated
                 with the row just produced)
   \param sstr - [Input]  Source block stride
   \param bw   - [Input]  Block width (consumed four pixels at a time;
                 original note: at least 8 pixels)
   \param bh   - [Input]  Block height
   \param ncu  - [Input]  Number of CUs (32/64)
*/
void apu_integral_image(vec32u __cmem* dst, int dstr,
                        const vec08u __cmem* src, vec32u __cmem* tlst,
                        int sstr, int bw, int bh, int ncu)
{
   for (int row = 0; row < bh; ++row)
   {
      // ---- Local integration: sum of this row's bw pixels ----
      // The first four pixels are loaded ahead of the loop; each iteration
      // accumulates the four pixels already loaded and fetches the next four.
      vec16u p0 = (vec16u)src[0];
      vec16u p1 = (vec16u)src[1];
      vec16u p2 = (vec16u)src[2];
      vec16u p3 = (vec16u)src[3];
      const vec08u __cmem* s = src + 4;

      vec16u rowSum = 0;
      for (int x = 4; x < bw; x += 4) chess_loop_range(1,) // at least 8 pixels
      {
         rowSum += p0;
         rowSum += p1;
         rowSum += p2;
         rowSum += p3;

         p0 = (vec16u)s[0];
         p1 = (vec16u)s[1];
         p2 = (vec16u)s[2];
         p3 = (vec16u)s[3];
         s += 4;
      }
      // Drain the last four pixels that were loaded but not yet added.
      rowSum += p0;
      rowSum += p1;
      rowSum += p2;
      rowSum += p3;

      // ---- Reduction ----
      // NOTE(review): vmsr() is presumably the APU vector shift between
      // neighbouring CUs - confirm against the APU-C reference.
      vec32u running = (vec32u)rowSum;
      rowSum = vmsr(rowSum);
      for (int hops = ncu - 1; hops > 0; --hops)
      {
         running += (vec32u)rowSum;
         rowSum = vmsr(rowSum);
      }

      // ---- Write-back, last column first ----
      // Rewind to the start of the row: the integration loop left s past
      // the pixels it consumed, while the indices below are row-relative.
      s = src;
      int x = bw - 1;
      do
      {
         vec32u out = running + tlst[x];
         dst[x]  = out;
         tlst[x] = out;
         running -= (vec32u)s[x];
      } while (--x >= 0);

      dst += dstr;
      src += sstr;
   }
}
void neon_init(void) { uint32_t v; // F**k this took a long time to discover ... // First, need to enable access to co-processors c10 and c11 - vfp and neon v = mrc15("c1, c0, 2"); v |= 0xf<<20; mcr15("c1, c0, 2", v); // required apparently asm volatile("isb"); // Next, just need to enable NEON instructions. The ROM or uboot has already turned it on. vmsr(FPEXC, 1<<30); }