/*
 * In-place inverse 8x8 DCT of `block` (64 coefficients, row-major).
 *
 * On x86 / x86-64 builds the work is delegated to the MMX block routine and
 * the MMX state is cleared with emms() afterwards.  Every other build uses
 * the direct four-nested-loop evaluation below, driven by the C[] and KC88[]
 * tables.
 */
void _dv_idct_88(dv_coeff_t *block)
{
#if ARCH_X86_64
  _dv_idct_block_mmx_x86_64(block);
  emms();
#elif ARCH_X86
  _dv_idct_block_mmx(block);
  emms();
#else /* ARCH_X86 */
  double accum[64];
  int out_row, out_col, freq_v, freq_h, idx;

  memset(accum, 0, sizeof(accum));

  /*
   * The output sample is the outer loop here; each sample still receives its
   * 64 contributions in (freq_v major, freq_h minor) order, so the floating
   * point sums are formed in the same sequence as a frequency-outer loop.
   */
  for (out_row = 0; out_row < 8; out_row++) {
    for (out_col = 0; out_col < 8; out_col++) {
      idx = out_row * 8 + out_col;
      for (freq_v = 0; freq_v < 8; freq_v++) {
        for (freq_h = 0; freq_h < 8; freq_h++) {
          accum[idx] += C[freq_v] * C[freq_h] * block[freq_v * 8 + freq_h] *
                        KC88[out_col][out_row][freq_h][freq_v];
        }
      }
    }
  }

  /* Store back, converting each double to the coefficient type. */
  for (idx = 0; idx < 64; idx++)
    block[idx] = accum[idx];
#endif
}
/*
 * Self-test for an MMX "add 8 signed words to 8 unsigned bytes and clamp to a
 * byte" sequence.
 *
 * For every i in [-32768, 32767] and every group of eight consecutive j
 * values in [0, 255], the eight source bytes (dat0) are all set to i
 * (truncated to unsigned char) and the eight source words (dat1) to
 * successive j values.  The MMX result (ans1) is compared against the scalar
 * reference clip(bp[k] + rfp[k]) (ans2); mismatches are printed.  The total
 * number of lanes tested is printed at the end.
 *
 * NOTE(review): `datp`, `ac` and `av` are never used.
 * NOTE(review): the comparison reads the two 8-byte arrays through a
 * uint64_t pointer cast, which relies on the compiler tolerating that
 * type-punning.
 */
int main(int ac, char **av)
{
  int i, j, k, n;
  unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
  long long *datp = (long long *)&dat0;
  int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
  /* volatile views so the scalar reference re-reads memory each time */
  volatile uint8_t *rfp = dat0;
  volatile int16_t *bp = dat1;
  unsigned char ans1[8], ans2[8];
  n = 0;
  for( i=-32768; i<32768; ++i ) {
    j = 0;
    while( j < 256 ) {
      /* The initialisers above are overwritten here before first use. */
      for( k=0; k<8; ++k ) {
        dat0[k] = i;
        dat1[k] = j++;
      }
      movq_m2r(m_(&rfp[0]),mm1); /* rfp[0..7] */
      pxor_r2r(mm3,mm3);         /* mm3 = 0 */
      pxor_r2r(mm4,mm4);         /* mm4 = 0 */
      movq_m2r(m_(&bp[0]),mm5); /* bp[0..3] */
      movq_r2r(mm1,mm2);
      movq_m2r(m_(&bp[4]),mm6); /* bp[4..7] */
      /* widen bytes to words by interleaving with the zero register;
         presumably the low unpack yields rfp[0..3] and the high unpack
         rfp[4..7] - confirm against the instruction reference */
      punpcklbw_r2r(mm3,mm1);
      punpckhbw_r2r(mm3,mm2);
      paddsw_r2r(mm5,mm1); /* += bp[0..3], signed saturating */
      paddsw_r2r(mm6,mm2); /* += bp[4..7], signed saturating */
      /* build masks of lanes where 0 > sum, then and-not them with the sums
         so negative lanes become 0 */
      pcmpgtw_r2r(mm1,mm3);
      pcmpgtw_r2r(mm2,mm4);
      pandn_r2r(mm1,mm3);
      pandn_r2r(mm2,mm4);
      /* pack the eight words back to bytes with unsigned saturation */
      packuswb_r2r(mm4,mm3);
      movq_r2m(mm3,m_(&ans1[0]));
      emms();
      /* scalar reference */
      ans2[0] = clip(bp[0] + rfp[0]);
      ans2[1] = clip(bp[1] + rfp[1]);
      ans2[2] = clip(bp[2] + rfp[2]);
      ans2[3] = clip(bp[3] + rfp[3]);
      ans2[4] = clip(bp[4] + rfp[4]);
      ans2[5] = clip(bp[5] + rfp[5]);
      ans2[6] = clip(bp[6] + rfp[6]);
      ans2[7] = clip(bp[7] + rfp[7]);
      if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] ) {
        printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i, ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]);
        printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j, ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]);
        // exit(0);
      }
      n += 8;
    }
  }
  printf("n=%d\n",n);
  return 0;
}
/*
 * Bidirectional SAD over a 16*h block.
 *
 * For every byte the prediction
 *   ((((*pf + *pf2 + 1)>>1) + ((*pb + *pb2 + 1)>>1) + 1)>>1)
 * is formed with three byte-average steps, and the absolute difference
 * between that prediction and *p2 is accumulated (psadbw against p2).
 * The accumulated sum is returned.
 *
 * pf, pf2 - the two forward reference rows that are averaged together
 * pb, pb2 - the two backward reference rows that are averaged together
 * p2      - the block the prediction is compared against
 * lx      - byte stride added to every pointer after each row
 * h       - number of rows; the loop is do/while, so the body runs at least
 *           once and h must be > 0 to terminate as intended
 *
 * Description assumes the mmx.h macros emit the like-named MMX/SSE
 * instructions.  mm7 is the running total; emms() clears MMX state before
 * returning.
 */
static int bsad_0quad_mmxe(uint8_t *pf,uint8_t *pf2,uint8_t *pb,uint8_t *pb2,uint8_t *p2,int lx,int h)
{
	int32_t s=0;
	pxor_r2r(mm7, mm7);	/* running SAD total = 0 */
	do {
		/* load both 8-byte halves of the 16-byte row from the "2" pointers */
		movq_m2r(pf2[0],mm0);
		movq_m2r(pf2[8],mm2);
		movq_m2r(pb2[0],mm1);
		movq_m2r(pb2[8],mm3);
		/* average with the matching first pointers: forward and backward */
		pavgb_m2r(pf[0],mm0);
		pavgb_m2r(pf[8],mm2);
		pavgb_m2r(pb[0],mm1);
		pavgb_m2r(pb[8],mm3);
		/* average forward with backward */
		pavgb_r2r(mm1,mm0);
		pavgb_r2r(mm3,mm2);
		/* sum of absolute differences against p2, then accumulate */
		psadbw_m2r(p2[0],mm0);
		psadbw_m2r(p2[8],mm2);
		paddd_r2r(mm0,mm7);
		paddd_r2r(mm2,mm7);
		/* advance all five pointers to the next row */
		pf+=lx;
		pf2+=lx;
		pb+=lx;
		pb2+=lx;
		p2+=lx;
		h--;
	} while (h);
	movd_r2g(mm7,s);
	emms();
	return s;
}
static char *status_get_totalcps(char *buffer) { int use_ticks, bufcat = 0; clock_t ticks; unsigned long time, sumtime; unsigned long long cps; double crypts, sumcrypts; unsigned cps_100; emms(); use_ticks = !status.crypts.hi && !status_restored_time; ticks = get_time() - status.start_time; if (use_ticks) time = ticks; else time = status_restored_time + ticks / clk_tck; crypts = ((long long)status.crypts.hi << 32) + status.crypts.lo; // This calculates the total cps figure (total crypts / avg run time). // It will show optimistic if the nodes don't finish at the same time MPI_Reduce(&time, &sumtime, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&crypts, &sumcrypts, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); time = sumtime / mpi_p; crypts = sumcrypts; if (use_ticks) crypts *= clk_tck; cps = crypts / (time ? time : 1); if (cps >= 1000000000000LL) bufcat = sprintf(buffer, "%lluG", (cps / 1000000000)); else if (cps >= 1000000000) bufcat = sprintf(buffer, "%lluM", (cps / 1000000)); else if (cps >= 1000000) bufcat = sprintf(buffer, "%lluK", (cps / 1000)); else if (cps >= 100) bufcat = sprintf(buffer, "%llu", cps); else { cps_100 = (unsigned)((unsigned long long)(crypts * 100 / (time ? time : 1)) % 100); bufcat = sprintf(buffer, "%llu.%02u", cps, cps_100); } cps = crypts / mpi_p / (time ? time : 1); if (cps >= 1000000000000LL) sprintf(&buffer[bufcat], " avg %lluG", (cps / 1000000000)); else if (cps >= 1000000000) sprintf(&buffer[bufcat], " avg %lluM", (cps / 1000000)); else if (cps >= 1000000) sprintf(&buffer[bufcat], " avg %lluK", (cps / 1000)); else if (cps >= 100) sprintf(&buffer[bufcat], " avg %llu", cps); else { cps_100 = (unsigned)((unsigned long long)(crypts * 100 / mpi_p / (time ? time : 1)) % 100); sprintf(&buffer[bufcat], " avg%llu.%02u", cps, cps_100); } return buffer; }
/*
 * Report progress as a percentage.
 *
 * A non-zero `progress` value is returned as-is; otherwise the figure is
 * derived from the rule counters as rule_number / (rule_count + 1) * 100.
 * MMX state is cleared first so the floating-point math below is safe.
 */
static double get_progress(void)
{
	double pct;

	emms();

	if (progress)
		pct = progress;
	else
		pct = (double)rule_number / (rule_count + 1) * 100.0;

	return pct;
}
/*
 * Convert `l` unsigned bytes at `src` to floats at `dst`, eight at a time.
 *
 * Each iteration loads 8 bytes, zero-extends them to eight 32-bit integers
 * (two per MMX register), converts each pair to two floats and stores the
 * pairs at dst[0], dst[2], dst[4] and dst[6].
 *
 * The loop always handles whole groups of 8, so when `l` is not a multiple
 * of 8 it reads and writes past element l-1 up to the next multiple of 8.
 * Description assumes the macros emit the like-named MMX/SSE instructions.
 */
static void frame_i2f_sse(u_char *src,float *dst,int l)
{
	int i;
	pxor_r2r(mm7,mm7);	/* mm7 = 0, used as the zero-extension source */
	for( i=0; i<l; i+=8 ) {
		movq_m2r(*src,mm0);		/* 8 source bytes */
		movq_r2r(mm0, mm2);
		punpcklbw_r2r(mm7, mm0);	/* bytes 0..3 -> words */
		punpckhbw_r2r(mm7, mm2);	/* bytes 4..7 -> words */
		movq_r2r(mm0, mm1);
		movq_r2r(mm2, mm3);
		punpcklwd_r2r(mm7, mm0);	/* words -> dwords: values 0,1 */
		punpckhwd_r2r(mm7, mm1);	/* values 2,3 */
		punpcklwd_r2r(mm7, mm2);	/* values 4,5 */
		punpckhwd_r2r(mm7, mm3);	/* values 6,7 */
		cvtpi2ps_r2r(mm0,xmm0);		/* two ints -> two floats each */
		cvtpi2ps_r2r(mm1,xmm1);
		cvtpi2ps_r2r(mm2,xmm2);
		cvtpi2ps_r2r(mm3,xmm3);
		movlps_r2m(xmm0,dst[0]);	/* store the low two floats */
		movlps_r2m(xmm1,dst[2]);
		movlps_r2m(xmm2,dst[4]);
		movlps_r2m(xmm3,dst[6]);
		src+=8;
		dst+=8;
	}
	emms();
}
/*
 * Build the " (ETA: ...)" suffix for the status line.
 *
 * percent   - textual percentage done; leading whitespace is allowed but the
 *             first non-space character must be a digit
 * secs_done - seconds spent so far
 *
 * Returns "" when there is no usable percentage or too little progress
 * (zero, or below ETAthreshold); otherwise a pointer to a static buffer that
 * is overwritten by the next call.  At >= 100% the buffer holds the current
 * time in parentheses.
 *
 * The estimate assumes the remaining work proceeds at the same rate as the
 * work already done, which may not always hold.
 */
static char *status_get_ETA(char *percent, unsigned int secs_done)
{
	static char s_ETA[128];
	char *scan = percent;
	double done_pct, remaining, projected;
	time_t when;
	struct tm *broken;

	emms();

	/* Locate the first non-blank character; it must start a number. */
	while (scan && *scan && isspace(((unsigned char)(*scan))))
		++scan;
	if (!scan || *scan == 0 || !isdigit(((unsigned char)(*scan))))
		return ""; /* dont show ETA if no valid percentage. */

	done_pct = atof(percent);
	when = time(NULL);

	/* Finished: report the current time rather than an estimate. */
	if (done_pct >= 100.0) {
		broken = localtime(&when);
		strcpy(s_ETA, " (");
		strftime(&s_ETA[2], sizeof(s_ETA)-3, timeFmt, broken);
		strcat(s_ETA, ")");
		return s_ETA;
	}

	if (done_pct == 0 || done_pct < ETAthreshold)
		return ""; /* mute ETA if too little progress */

	/* remaining = secs_done / fraction_done - secs_done */
	done_pct /= 100;
	remaining = secs_done;
	remaining /= done_pct;
	remaining -= secs_done;

	/*
	 * Many localtime() implementations fail for a time_t beyond
	 * Jan 19, 2038 (0x7FFFFFFF).  A run projected to end after that is
	 * reported as never finishing; the limit is kept slightly below the
	 * 32-bit maximum for safety.
	 */
	projected = remaining;
	projected += when;
	if (projected > 0x7FFFF000) {
		strcpy(s_ETA, " (ETA: never)");
		return s_ETA;
	}

	when += remaining;
	broken = localtime(&when);
	strcpy(s_ETA, " (ETA: ");
	/* Less than a day left: use the 24-hour format string. */
	strftime(&s_ETA[7], sizeof(s_ETA)-10,
	         remaining < 24 * 3600 ? timeFmt24 : timeFmt, broken);
	strcat(s_ETA, ")");
	return s_ETA;
}
/*
 * Leave an optimised code section: when the CPU feature mask has the MMX or
 * MMX2 bit set, issue emms(); otherwise do nothing.
 */
EAPI void
evas_common_cpu_end_opt(void)
{
   if ((cpu_feature_mask & (CPU_FEATURE_MMX | CPU_FEATURE_MMX2)) == 0)
     return;

   emms();
}
/*
 * Check and benchmark one motion-estimation routine.
 *
 * name      - label printed in the report
 * test_func - implementation under test
 * ref_func  - reference implementation it is compared with
 *
 * Phase 1 runs 20 rounds on freshly randomised images and prints every
 * position where the two functions disagree.  Phase 2 times NB_ITS sweeps of
 * test_func alone and prints the throughput.  emms() is called after each
 * phase so that later floating-point code sees clean FPU state.
 */
void test_motion(const char *name, motion_func *test_func, motion_func *ref_func)
{
    int col, row, got, want, pass;
    uint8_t *cand;
    int64_t elapsed;

    printf("testing '%s'\n", name);

    /* test correctness */
    for (pass = 0; pass < 20; pass++) {
        fill_random(img1, WIDTH * HEIGHT);
        fill_random(img2, WIDTH * HEIGHT);

        for (row = 0; row < HEIGHT - 17; row++) {
            for (col = 0; col < WIDTH - 17; col++) {
                cand = img2 + row * WIDTH + col;
                got = test_func(img1, cand, WIDTH);
                want = ref_func(img1, cand, WIDTH);
                if (got != want) {
                    printf("error: mmx=%d c=%d\n", got, want);
                }
            }
        }
    }
    emms();

    /* speed test */
    elapsed = gettime();
    got = 0;
    for (pass = 0; pass < NB_ITS; pass++) {
        for (row = 0; row < HEIGHT - 17; row++) {
            for (col = 0; col < WIDTH - 17; col++) {
                cand = img2 + row * WIDTH + col;
                got += test_func(img1, cand, WIDTH);
            }
        }
    }
    emms();
    dummy = got; /* avoid optimisation */
    elapsed = gettime() - elapsed;

    printf(" %0.0f kop/s\n",
           (double)NB_ITS * (WIDTH - 16) * (HEIGHT - 16) / (double)(elapsed / 1000.0));
}
static double get_progress(void) { // This is a dummy function just for getting the DONE // timestamp from status.c - it will return -1 all // the time except when a mode is finished emms(); return progress; }
/*
 * In-place forward 8x8 DCT of `block` (64 coefficients).
 *
 * Non-x86 builds: either the brute-force four-loop evaluation (when
 * BRUTE_FORCE_DCT_88 is set) or the AAN routine followed by its post-scale.
 * x86 / x86-64 builds: MMX 1-D pass, transpose, second 1-D pass, post-scale
 * with postSC88, then emms().
 */
void _dv_dct_88(dv_coeff_t *block)
{
#if ((!ARCH_X86) && (!ARCH_X86_64))
#if BRUTE_FORCE_DCT_88
  double acc[64];
  int scale = pow(2, DCT_YUV_PRECISION);
  int fv, fh, sy, sx, k, slot;

  memset(acc, 0, sizeof(acc));

  for (fv = 0; fv < 8; fv++) {
    for (fh = 0; fh < 8; fh++) {
      slot = fv * 8 + fh;
      /* Sum every input sample weighted by the basis table ... */
      for (sy = 0; sy < 8; sy++) {
        for (sx = 0; sx < 8; sx++) {
          acc[slot] += block[sx * 8 + sy] * KC88[sx][sy][fh][fv];
        }
      }
      /* ... then apply the two normalisation factors. */
      acc[slot] *= (C[fh] * C[fv]);
    }
  }

  /* Scale down by 2^DCT_YUV_PRECISION while storing back. */
  for (k = 0; k < 64; k++) {
    block[k] = acc[k] / scale;
  }
#else /* BRUTE_FORCE_DCT_88 */
  dct88_aan(block);
  postscale88(block);
#endif /* BRUTE_FORCE_DCT_88 */
#elif ARCH_X86_64
  _dv_dct_88_block_mmx_x86_64(block);
  _dv_transpose_mmx_x86_64(block);
  _dv_dct_88_block_mmx_x86_64(block);
  _dv_dct_block_mmx_x86_64_postscale_88(block, postSC88);
  emms();
#else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
  _dv_dct_88_block_mmx(block);
  _dv_transpose_mmx(block);
  _dv_dct_88_block_mmx(block);
  _dv_dct_block_mmx_postscale_88(block, postSC88);
  emms();
#endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
}
// Spill every one of the eight MMX registers except the one named by r64,
// then emit emms().
void RegisterAllocator::spillMMXcept(const OperandMMREG &r64)
{
	int index = 0;

	while(index < 8)
	{
		if(r64.reg != index)
		{
			spill64(index);
		}

		++index;
	}

	emms();
}
/*
 * Linear deinterlace of one scanline: each output byte is the average of
 * the corresponding bytes of the line above (scanlines->t0) and the line
 * below (scanlines->b0).
 *
 * `width` is counted in 2-byte pixels: the main loop consumes 32 bytes per
 * 16 pixels, the second loop 8 bytes per 4 pixels, and the scalar tail
 * handles the remaining 0..3 pixels (width * 2 bytes).
 *
 * The vector paths use pavgb while the scalar tail uses a truncating
 * (a + b) >> 1; this difference is kept as found.
 *
 * self and parent are unused here.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  gint i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  /* 16 pixels (32 bytes) per iteration. */
  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  /* 4 pixels (8 bytes) per iteration. */
  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  /* Only width % 4 pixels remain once the 4-pixel loop is done; a wider
   * mask would reprocess pixels and run past the end of the line. */
  width = width & 0x3;

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
/*
 * Scan a grid of candidate positions (step 4 in x and y) and collect those
 * whose SAD against `ref` is within `threshold`.
 *
 * blk         - start of the search area
 * ref         - reference block handed to init_qblock_sad / qblock_sad
 * ilow..ihigh - inclusive x range of candidates
 * jlow..jhigh - inclusive y range of candidates
 * h, rowstride- passed through to the SAD helpers and load_blk
 * threshold   - initial acceptance limit; it is tightened to
 *               min(4 * weight, threshold) after each accepted match
 * resvec      - output array; the caller must size it for every candidate
 *
 * Returns the number of entries written to resvec.
 *
 * NOTE(review): load_blk / shift_blk / qblock_sad presumably share state in
 * MMX registers, so statement order matters - confirm before reordering.
 * NOTE(review): x and y are stored through a (uint8_t) cast, so values
 * outside 0..255 are truncated - confirm against the me_result_s field types.
 */
int SIMD_SUFFIX(mblocks_sub44_mests)( uint8_t *blk, uint8_t *ref,
									  int ilow,int jlow,
									  int ihigh, int jhigh,
									  int h, int rowstride,
									  int threshold,
									  me_result_s *resvec)
{
	int32_t x,y;
	uint8_t *currowblk = blk;
	uint8_t *curblk;
	me_result_s *cres = resvec;
	int gridrowstride = rowstride;
	int weight;

	SIMD_SUFFIX(init_qblock_sad)(ref, h, rowstride);
	for( y=jlow; y <= jhigh ; y+=4)
	{
		curblk = currowblk;
		// You'd think prefetching curblk+4*rowstride would help here.
		// I have found *NO* measurable increase in performance...
		for( x = ilow; x <= ihigh; x += 4)
		{
			/* Reload on the first column and every 16 columns after it;
			   shift_blk(8) is applied for the columns in between. */
			if( (x & 15) == (ilow & 15) )
			{
				load_blk( curblk, rowstride, h );
				curblk += 4;
			}
			weight = SIMD_SUFFIX(qblock_sad)(ref, h, rowstride);
			shift_blk(8);
			if( weight <= threshold )
			{
				threshold = intmin(weight<<2,threshold);
				/* Rough and-ready absolute distance penalty */
				/* NOTE: This penalty is *vital* to correct operation
				   as otherwise the sub-mean filtering won't work on very
				   uniform images. */
				cres->weight = (uint16_t)(weight+(intmax(abs(x),abs(y))<<2));
				cres->x = (uint8_t)x;
				cres->y = (uint8_t)y;
				++cres;
			}
		}
		/* NOTE(review): the row pointer advances by one rowstride per grid
		   row although y advances by 4 - confirm against the caller. */
		currowblk += gridrowstride;
	}
	emms();
	return cres - resvec;
}
/*
 * Filter one line from five vertically neighbouring input lines, four
 * pixels per iteration.
 *
 * Per pixel the sequence below evaluates
 *   (4*(lum_m3 + lum_m1) + 2*lum_m2 + 4 - (lum_m4 + lum)) >> 3
 * where the subtraction saturates at 0 (psubusw) and the final pack
 * saturates to a byte (packuswb).
 *
 * dst    - output line
 * lum_m4 .. lum - the five input lines
 * size   - number of pixels; groups of 4 use MMX, any remaining 1..3 pixels
 *          are handed to deinterlace_line_c after emms()
 *
 * Description assumes the macros emit the like-named MMX instructions.
 */
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3,
    uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum, int size)
{
  mmx_t rounder;

  /* rounding term: 4 in each of the four 16-bit lanes */
  rounder.uw[0] = 4;
  rounder.uw[1] = 4;
  rounder.uw[2] = 4;
  rounder.uw[3] = 4;
  pxor_r2r (mm7, mm7);          /* mm7 = 0 for byte -> word widening */
  movq_m2r (rounder, mm6);

  for (; size > 3; size -= 4) {
    /* load 4 bytes from each of the five lines */
    movd_m2r (*lum_m4, mm0);
    movd_m2r (*lum_m3, mm1);
    movd_m2r (*lum_m2, mm2);
    movd_m2r (*lum_m1, mm3);
    movd_m2r (*lum, mm4);
    /* widen to 16-bit lanes */
    punpcklbw_r2r (mm7, mm0);
    punpcklbw_r2r (mm7, mm1);
    punpcklbw_r2r (mm7, mm2);
    punpcklbw_r2r (mm7, mm3);
    punpcklbw_r2r (mm7, mm4);
    paddw_r2r (mm3, mm1);       /* lum_m3 + lum_m1 */
    psllw_i2r (1, mm2);         /* 2 * lum_m2 */
    paddw_r2r (mm4, mm0);       /* lum_m4 + lum */
    psllw_i2r (2, mm1);         // 2
    paddw_r2r (mm6, mm2);       /* + rounder */
    paddw_r2r (mm2, mm1);
    psubusw_r2r (mm0, mm1);     /* subtract outer taps, clamp at 0 */
    psrlw_i2r (3, mm1);         // 3
    packuswb_r2r (mm7, mm1);    /* back to bytes */
    movd_r2m (mm1, *dst);
    lum_m4 += 4;
    lum_m3 += 4;
    lum_m2 += 4;
    lum_m1 += 4;
    lum += 4;
    dst += 4;
  }
  emms ();

  /* Handle odd widths */
  if (size > 0)
    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
/*
 * Convert a packed YUY2 image to planar Y, U and V planes.
 *
 * The whole body is compiled only when HAVE_MMX is set; otherwise the
 * function does nothing.
 *
 * yuy2_map / yuy2_pitch - source image and its line pitch in bytes
 * y_dst, u_dst, v_dst   - destination planes with their respective pitches
 * width, height         - image size; the loops run height/2 times over two
 *                         source lines each and width/8 times per line pair,
 *                         so remainders are not processed here
 *
 * NOTE(review): MMXEXT_YUYV_YUV420() takes no arguments, so it presumably
 * refers to the locals below (p_line1, p_line2, p_y1, p_y2, p_u, p_v) by
 * name and advances them - do not rename them without checking the macro.
 */
static void yuy2_to_yv12_mmxext (const unsigned char *yuy2_map, int yuy2_pitch,
    unsigned char *y_dst, int y_dst_pitch,
    unsigned char *u_dst, int u_dst_pitch,
    unsigned char *v_dst, int v_dst_pitch,
    int width, int height)
{
#if HAVE_MMX
  const uint8_t *p_line1, *p_line2 = yuy2_map;
  uint8_t *p_y1, *p_y2 = y_dst;
  uint8_t *p_u = u_dst;
  uint8_t *p_v = v_dst;

  int i_x, i_y;

  /* Bytes to skip at the end of each line to reach the next one. */
  const int i_dest_margin = y_dst_pitch - width;
  const int i_dest_u_margin = u_dst_pitch - width/2;
  const int i_dest_v_margin = v_dst_pitch - width/2;
  const int i_source_margin = yuy2_pitch - width*2;

  /* Build the constant mask used by the conversion macro in mm7. */
  __asm__ __volatile__(
    "pcmpeqw %mm7, %mm7 \n\t"
    "psrlw $8, %mm7 \n\t" /* 00 ff 00 ff 00 ff 00 ff */
  );

  for ( i_y = height / 2 ; i_y-- ; )
  {
    /* Two source lines and two luma lines are handled per pass. */
    p_line1 = p_line2;
    p_line2 += yuy2_pitch;

    p_y1 = p_y2;
    p_y2 += y_dst_pitch;

    for ( i_x = width / 8 ; i_x-- ; )
    {
      MMXEXT_YUYV_YUV420( );
    }

    p_y2 += i_dest_margin;
    p_u += i_dest_u_margin;
    p_v += i_dest_v_margin;
    p_line2 += i_source_margin;
  }

  sfence();
  emms();
#endif
}
/*
 * Python method: encode one video frame with the codec held by `obj`.
 *
 * args must contain a single VFrame object.  On success the encoded data is
 * wrapped by Frame_New_LAVC_Enc() and returned; on failure a Python
 * exception is set on g_cErr and NULL is returned.
 *
 * Errors are raised when the codec is not initialised, when the codec frame
 * size is zero, when the supplied frame is smaller than the codec frame
 * size, or when the encoder reports a non-positive length.
 *
 * NOTE(review): the 300000-byte output buffer lives on the stack.
 */
static PyObject* Codec_Encode( PyCodecObject* obj, PyObject *args)
{
	PyVFrameObject* cFrame = NULL;
	PyObject* cRes = NULL;
	int iLen = 0;
	AVFrame picture;

#define ENCODE_OUTBUF_SIZE 300000

	char sOutbuf[ ENCODE_OUTBUF_SIZE ];
	if (!PyArg_ParseTuple(args, "O!", &VFrameType, &cFrame ))
		return NULL;

	/* Both the context and its codec must be present; test the context
	   first so a NULL context is never dereferenced. */
	if (!(obj->cCodec && obj->cCodec->codec))
	{
		PyErr_SetString(g_cErr, "Encode error:codec not initialized" );
		return NULL;
	}

	/* The codec must have a non-zero frame size configured. */
	if (!obj->cCodec->width || ! obj->cCodec->height)
	{
		PyErr_SetString(g_cErr, "Encode: zero frame size set in codec" );
		return NULL;
	}

	/* The frame must be at least as large as the codec frame size. */
	if ((obj->cCodec->width > frame_get_width(cFrame)) ||
			(obj->cCodec->height > frame_get_height(cFrame)) )
	{
		PyErr_SetString(g_cErr, "Encode: cannot change resolution for frame. Use scaling first..." );
		return NULL;
	}

	/* check codec params */
	PyVFrame2AVFrame(cFrame, &picture, 1 );
	iLen = avcodec_encode_video(obj->cCodec, sOutbuf, ENCODE_OUTBUF_SIZE, &picture);
	if (iLen > 0)
		cRes= Frame_New_LAVC_Enc( obj, sOutbuf, iLen );
	else
		PyErr_Format(g_cErr, "Failed to encode frame( error code is %d )", iLen );

#ifdef HAVE_MMX
	emms();
#endif
	return cRes;
}
static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top, uint8_t *bot, int width ) { int i; for( i = width/16; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); movq_m2r( *(bot + 8), mm2 ); movq_m2r( *(top + 8), mm3 ); movq_m2r( *(bot + 16), mm4 ); movq_m2r( *(top + 16), mm5 ); movq_m2r( *(bot + 24), mm6 ); movq_m2r( *(top + 24), mm7 ); pavgb_r2r( mm1, mm0 ); pavgb_r2r( mm3, mm2 ); pavgb_r2r( mm5, mm4 ); pavgb_r2r( mm7, mm6 ); movntq_r2m( mm0, *output ); movntq_r2m( mm2, *(output + 8) ); movntq_r2m( mm4, *(output + 16) ); movntq_r2m( mm6, *(output + 24) ); output += 32; top += 32; bot += 32; } width = (width & 0xf); for( i = width/4; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); pavgb_r2r( mm1, mm0 ); movntq_r2m( mm0, *output ); output += 8; top += 8; bot += 8; } width = width & 0x7; /* Handle last few pixels. */ for( i = width * 2; i; --i ) { *output++ = ((*top++) + (*bot++)) >> 1; } sfence(); emms(); }
/*
 * Linear deinterlace of one scanline: out[k] is the average of top[k] and
 * bot[k] for `size` bytes.
 *
 * The main loop handles 32 bytes per iteration, the second loop 8 bytes,
 * and the scalar tail (run after emms()) the remaining 0..7 bytes.
 *
 * The vector paths use pavgb while the scalar tail uses a truncating
 * (a + b) >> 1; this difference is kept as found.
 *
 * self is unused here.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  gint i;

  /* 32 bytes per iteration. */
  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  /* 8 bytes per iteration. */
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  /* Only size % 8 bytes remain once the 8-byte loop is done; a wider mask
   * would reprocess bytes and run past the end of the line. */
  size = size & 0x7;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
/*
 * In-place forward 2-4-8 DCT of `block` (64 coefficients).
 *
 * Non-x86 builds: either the brute-force evaluation (when
 * BRUTE_FORCE_DCT_248 is set), which fills the upper four rows from sums and
 * the lower four rows from differences of adjacent sample pairs, or the AAN
 * routine followed by its post-scale.
 * x86 / x86-64 builds: MMX 8-point pass, transpose, 2-4-8 pass, post-sum and
 * post-scale with postSC248, then emms() so that callers can use the FPU
 * again - the same epilogue the 8x8 DCT uses on both x86 variants.
 */
void _dv_dct_248(dv_coeff_t *block)
{
#if ((!ARCH_X86) && (!ARCH_X86_64))
#if BRUTE_FORCE_DCT_248
  int u,h,z,x,i;
  double temp[64];
  int factor = pow(2, DCT_YUV_PRECISION);

  memset(temp,0,sizeof(temp));
  for (u=0;u<4;u++) {
    for (h=0;h<8;h++) {
      for (z=0;z<4;z++) {
        for (x=0;x<8;x++) {
          /* sum of the sample pair -> rows 0..3, difference -> rows 4..7 */
          temp[u*8+h] +=
            (block[x*8+2*z] + block[x*8+(2*z+1)]) * KC248[x][z][u][h];
          temp[(u+4)*8+h] +=
            (block[x*8+2*z] - block[x*8+(2*z+1)]) * KC248[x][z][u][h];
        }
      }
      temp[u*8+h] *= (C[h] * C[u]);
      temp[(u+4)*8+h] *= (C[h] * C[u]);
    }
  }

  /* Scale down by 2^DCT_YUV_PRECISION while storing back. */
  for (i=0;i<64;i++)
    block[i] = temp[i] / factor;
#else /* BRUTE_FORCE_DCT_248 */
  dct248_aan(block);
  postscale248(block);
#endif /* BRUTE_FORCE_DCT_248 */
#elif ARCH_X86_64
  _dv_dct_88_block_mmx_x86_64(block);
  _dv_transpose_mmx_x86_64(block);
  _dv_dct_248_block_mmx_x86_64(block);
  _dv_dct_248_block_mmx_x86_64_post_sum(block);
  _dv_dct_block_mmx_x86_64_postscale_248(block, postSC248);
  emms();
#else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
  _dv_dct_88_block_mmx(block);
  _dv_transpose_mmx(block);
  _dv_dct_248_block_mmx(block);
  _dv_dct_248_block_mmx_post_sum(block);
  _dv_dct_block_mmx_postscale_248(block, postSC248);
  emms();
#endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
}
/*
 * For eight adjacent reference pixels at once, scan a
 * radius_count x radius_count window of `pixel` and, for every window
 * sample whose absolute difference from the reference is below `threshold`,
 * count it and add its signed difference.
 *
 * refpix       - eight reference bytes
 * pixel        - top-left of the window; advanced one byte per column and
 *                row_stride per row
 * radius_count - window width and height in samples
 * row_stride   - bytes between window rows
 * threshold    - comparison limit; its low byte is replicated across all
 *                eight lanes
 * diff         - receives eight signed per-lane difference sums
 * count        - receives eight per-lane match counts
 *
 * Both accumulators are 8-bit lanes (paddb / psubb), so they wrap for large
 * windows.  Values are biased by 128 (xor 0x80) so that the signed byte
 * instructions can be used on unsigned pixels.
 * Description assumes the macros emit the like-named MMX/SSE instructions.
 */
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count)
{
    int a,b;

    pxor_r2r(mm6,mm6); // mm6 (aka count) = 0
    pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0
    movq_m2r(*refpix,mm3); // mm3 = refpix[0]

    movd_g2r(0x80808080,mm4); // mm4 = 128
    punpcklbw_r2r(mm4,mm4);   // replicate 0x80 into all eight bytes

    pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128

    movd_g2r(threshold,mm5); // mm5 = threshold
    punpcklbw_r2r(mm5,mm5);  // three unpacks broadcast the low byte
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);

    for( b=0; b<radius_count; b++ ) {
        for( a=0; a<radius_count; a++ ) {
            movq_m2r(*pixel,mm0); // mm0 = pixel[0]
            pxor_r2r(mm4,mm0); // mm0 = pixel[0]-128
            movq_r2r(mm3,mm2); // mm2 = refpix[0]-128
            psubsb_r2r(mm0,mm2); // mm2 = refpix[0]-pixel[0]
            psubsb_r2r(mm3,mm0); // mm0 = pixel[0]-refpix[0]
            pminub_r2r(mm0,mm2); // mm2 = abs(pixel[0]-refpix[0])
            movq_r2r(mm5,mm1); // mm1 = threshold
            pcmpgtb_r2r(mm2,mm1); // mm1 = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1,mm6); // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1,mm0); // mm0 = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0,mm7); // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0

            ++pixel;
        }
        pixel += row_stride - radius_count; // start of the next window row
    }

    movq_r2m(mm6,*count);
    movq_r2m(mm7,*diff);

    emms();
}
/*
 * Copy n bytes from s to d using MMX loads and movntq stores.
 *
 * Nothing is done when d == s.  Blocks of 64 bytes are copied while more
 * than 64 bytes remain, then blocks of 8 while more than 8 remain, and the
 * final 1..8 bytes go through small_memcpy().  Because both loop tests are
 * strict (>), a final block of exactly 64 bytes is handled by the 8-byte
 * loop and a final block of exactly 8 bytes by small_memcpy().
 *
 * sfence() follows the movntq stores and emms() clears MMX state.  No
 * overlap handling is visible here.
 */
static void fast_memcpy_mmxext( void *d, const void *s, size_t n )
{
    const uint8_t *src = s;
    uint8_t *dest = d;

    if( dest != src ) {
        while( n > 64 ) {
            /* load 64 bytes into mm0..mm7, then store them */
            movq_m2r( src[ 0 ], mm0 );
            movq_m2r( src[ 8 ], mm1 );
            movq_m2r( src[ 16 ], mm2 );
            movq_m2r( src[ 24 ], mm3 );
            movq_m2r( src[ 32 ], mm4 );
            movq_m2r( src[ 40 ], mm5 );
            movq_m2r( src[ 48 ], mm6 );
            movq_m2r( src[ 56 ], mm7 );
            movntq_r2m( mm0, dest[ 0 ] );
            movntq_r2m( mm1, dest[ 8 ] );
            movntq_r2m( mm2, dest[ 16 ] );
            movntq_r2m( mm3, dest[ 24 ] );
            movntq_r2m( mm4, dest[ 32 ] );
            movntq_r2m( mm5, dest[ 40 ] );
            movntq_r2m( mm6, dest[ 48 ] );
            movntq_r2m( mm7, dest[ 56 ] );
            dest += 64;
            src += 64;
            n -= 64;
        }

        while( n > 8 ) {
            movq_m2r( src[ 0 ], mm0 );
            movntq_r2m( mm0, dest[ 0 ] );
            dest += 8;
            src += 8;
            n -= 8;
        }

        /* remaining tail bytes */
        if( n ) small_memcpy( dest, src, n );

        sfence();
        emms();
    }
}
void RegisterAllocator::free64(int i) { bool free = (MMX[i].priority != 0); if(MMX[i].loadInstruction && loadElimination) { MMX[i].loadInstruction->reserve(); MMX[i].loadInstruction = 0; } if(MMX[i].copyInstruction && copyPropagation) { MMX[i].copyInstruction->reserve(); MMX[i].copyInstruction = 0; } MMX[i].reference = 0; MMX[i].partial = 0; MMX[i].priority = 0; if(free && autoEMMS) { for(int i = 0; i < 8; i++) { if(MMX[i].priority != 0) { return; } } // Last one freed emms(); // Completely eraze MMX allocation state for(int i = 0; i < 8; i++) { MMX[i].free(); } } }
/*
 * Convert `l` floats at `src` to unsigned bytes at `dst`, eight at a time.
 *
 * Each group of eight floats is converted to 32-bit integers, packed to
 * signed 16-bit with saturation, offset by -128, packed to signed 8-bit
 * with saturation and finally offset by +128 as bytes, which clamps the
 * result to 0..255.
 *
 * The loop always handles whole groups of 8, so when `l` is not a multiple
 * of 8 it reads and writes past element l-1 up to the next multiple of 8.
 * NOTE(review): movaps presumably requires `src` to be 16-byte aligned -
 * confirm against the callers.
 * Description assumes the macros emit the like-named MMX/SSE instructions.
 */
static void frame_f2i_sse(float *src,u_char *dst,int l)
{
	int i;

	// put 128 in all 4 words of mm7
	movd_g2r(128,mm7);
	punpcklwd_r2r(mm7,mm7);
	punpckldq_r2r(mm7,mm7);

	// put 128 in all 8 bytes of mm6
	movd_g2r(128,mm6);
	punpcklbw_r2r(mm6,mm6);
	punpcklwd_r2r(mm6,mm6);
	punpckldq_r2r(mm6,mm6);

	for( i=0; i<l; i+=8 ) {
		movaps_m2r(src[0],xmm0);	/* floats 0..3 */
		movaps_m2r(src[4],xmm2);	/* floats 4..7 */
		movhlps_r2r(xmm0,xmm1);		/* bring floats 2,3 into the low half */
		cvtps2pi_r2r(xmm0,mm0);		/* low two floats -> two int32 */
		cvtps2pi_r2r(xmm1,mm1);
		movhlps_r2r(xmm2,xmm3);		/* bring floats 6,7 into the low half */
		cvtps2pi_r2r(xmm2,mm2);
		cvtps2pi_r2r(xmm3,mm3);
		packssdw_r2r(mm1,mm0);		/* values 0..3 as signed words */
		packssdw_r2r(mm3,mm2);		/* values 4..7 as signed words */

		psubw_r2r(mm7,mm0);		/* bias by -128 before the signed pack */
		psubw_r2r(mm7,mm2);

		packsswb_r2r(mm2, mm0);		/* eight signed bytes, saturated */

		paddb_r2r(mm6, mm0);		/* undo the bias: +128 per byte */

		movq_r2m(mm0,dst[0]);

		src+=8;
		dst+=8;
	}

	emms();
}
/*
 * Print one status line to stderr: guess count, elapsed time, progress,
 * ETA, crack rate and (unless suppressed) the candidate(s) being tried.
 *
 * percent - progress text; when it starts with " 100" the word " DONE" is
 *           printed in its place
 *
 * The "trying" part is omitted when FLG_STATUS_CHK is set or no crypts have
 * been counted yet.  When options.report_utf8 is set and options.utf8 is
 * not, both candidates are converted with enc_to_utf8_r() before printing.
 *
 * With HAVE_MPI the entire line is assembled and written by a single
 * fprintf so that output from different nodes does not interleave; the
 * line is prefixed with the node id when more than one node is running.
 * Without HAVE_MPI the line is written in two steps.
 */
static void status_print_cracking(char *percent)
{
	unsigned int time = status_get_time();
	char *key, saved_key[PLAINTEXT_BUFFER_SIZE] = "";
	char s_cps[64], cand[32] = "";

	emms();

	/* Copy the second key now, before later calls can change it. */
	if (!(options.flags & FLG_STATUS_CHK))
		if ((key = crk_get_key2()))
			strnzcpy(saved_key, key, PLAINTEXT_BUFFER_SIZE);

	/* Optional "/<candidates tried>" suffix after the guess count. */
	if (showcand)
		sprintf(cand, "/%.0f", (double)((long long)status.crypts.hi << 32) + status.crypts.lo);

#ifdef HAVE_MPI
	// we need to print until cr in one call, otherwise output gets interleaved
	char nodeid[11] = "";
	if (mpi_p > 1)
		snprintf(nodeid, sizeof(nodeid), "%3d: ", mpi_id);
	nodeid[sizeof(nodeid)-1] = 0;
	char trying[256];
	if ((options.flags & FLG_STATUS_CHK) ||
	    !(status.crypts.lo | status.crypts.hi))
		trying[0] = 0;
	else {
		UTF8 t1buf[PLAINTEXT_BUFFER_SIZE + 1];
		UTF8 t2buf[PLAINTEXT_BUFFER_SIZE + 1];
		char *t1, *t2;
		if (options.report_utf8 && !options.utf8) {
			t1 = (char*)enc_to_utf8_r(crk_get_key1(), t1buf, PLAINTEXT_BUFFER_SIZE);
			t2 = (char*)enc_to_utf8_r(saved_key, t2buf, PLAINTEXT_BUFFER_SIZE);
		} else {
			t1 = crk_get_key1();
			t2 = saved_key;
		}
		/* " - <key2>" is appended only when a second key was saved */
		snprintf(trying, sizeof(trying), "%strying: %s%s%s", mpi_p > 1 ? " " : " ", t1, t2[0] ? " - " : "", t2);
	}

	fprintf(stderr,
	        "%s"
	        "guesses: %u%s%s"
	        "time: %u:%02u:%02u:%02u"
	        "%s%s%s"
	        "c/s: %s"
	        "%s\n",
	        nodeid,
	        status.guess_count,
	        cand,
	        mpi_p > 1 ? " " : " ",
	        /* elapsed seconds split into days:hours:minutes:seconds */
	        time / 86400, time % 86400 / 3600, time % 3600 / 60, time % 60,
	        strncmp(percent, " 100", 4) ? percent : " DONE",
	        status_get_ETA(percent,time),
	        mpi_p > 1 ? " " : " ",
	        status_get_cps(s_cps),
	        trying);
#else
	fprintf(stderr,
	        "guesses: %u%s "
	        "time: %u:%02u:%02u:%02u"
	        "%s%s "
	        "c/s: %s",
	        status.guess_count,
	        cand,
	        /* elapsed seconds split into days:hours:minutes:seconds */
	        time / 86400, time % 86400 / 3600, time % 3600 / 60, time % 60,
	        strncmp(percent, " 100", 4) ? percent : " DONE",
	        status_get_ETA(percent,time),
	        status_get_cps(s_cps));

	if ((options.flags & FLG_STATUS_CHK) ||
	    !(status.crypts.lo | status.crypts.hi))
		fputc('\n', stderr);
	else {
		UTF8 t1buf[PLAINTEXT_BUFFER_SIZE + 1];
		UTF8 t2buf[PLAINTEXT_BUFFER_SIZE + 1];
		char *t1, *t2;
		if (options.report_utf8 && !options.utf8) {
			t1 = (char*)enc_to_utf8_r(crk_get_key1(), t1buf, PLAINTEXT_BUFFER_SIZE);
			t2 = (char*)enc_to_utf8_r(saved_key, t2buf, PLAINTEXT_BUFFER_SIZE);
		} else {
			t1 = crk_get_key1();
			t2 = saved_key;
		}
		/* " - <key2>" is appended only when a second key was saved */
		fprintf(stderr, " trying: %s%s%s\n", t1, t2[0] ? " - " : "", t2);
	}
#endif
}
static char *status_get_totalETA(char *percent, unsigned int secs_done) { static char s_ETA[128]; char *cp; double sec_left, percent_left, max_sec_left; time_t t_ETA; struct tm *pTm; emms(); cp = percent; while (cp && *cp && isspace(*cp)) ++cp; if (!cp || *cp == 0 || !isdigit(*cp)) { // We must report to MPI_Allreduce anyway sec_left = 0; MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); return ""; /* dont show ETA if no valid percentage. */ } else { double chk; percent_left = atof(percent); t_ETA = time(NULL); if (percent_left >= 100.0) { // We must report to MPI_Allreduce anyway sec_left = 0; MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); pTm = localtime(&t_ETA); strcpy(s_ETA, " ("); strftime(&s_ETA[2], sizeof(s_ETA)-3, timeformat, pTm); strcat(s_ETA, ")"); return s_ETA; } if (percent_left == 0 || percent_left < ETAthreshold) { // We must report to MPI_Allreduce anyway sec_left = 0; MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); return ""; /* mute ETA if too little progress */ } percent_left /= 100; sec_left = secs_done; sec_left /= percent_left; sec_left -= secs_done; // Reports the worst ETA for all nodes MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); sec_left = max_sec_left; chk = sec_left; chk += t_ETA; if (chk > 0x7FFFF000) { /* slightly less than 'max' 32 bit time_t, for safety */ strcpy(s_ETA, " (ETA: never)"); return s_ETA; } t_ETA += sec_left; pTm = localtime(&t_ETA); strcpy(s_ETA, " (ETA: "); strftime(&s_ETA[7], sizeof(s_ETA)-10, timeformat, pTm); strcat(s_ETA, ")"); } return s_ETA; }
/*
 * Linear deinterlace of one scanline with plain MMX: out[k] is
 * (top[k] >> 1) + (bot[k] >> 1) for `size` bytes.
 *
 * Bytes are halved in 16-bit lanes (psrlw), so each input is first masked
 * with `shiftmask` to clear the bit that would otherwise shift from the
 * upper byte of a word into the lower byte.
 *
 * The main loop handles 32 bytes per iteration, the second loop 8 bytes,
 * and the scalar tail (run after emms()) the remaining 0..7 bytes using
 * (a + b) >> 1.
 *
 * self is unused here.
 */
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  /* 32 bytes per iteration. */
  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  /* 8 bytes per iteration. */
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  /* Only size % 8 bytes remain once the 8-byte loop is done; a wider mask
   * would reprocess bytes and run past the end of the line. */
  size = size & 0x7;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
int main() { int rval; mmx_t ma; mmx_t mb; movq_r2r(mm0, mm1); rval = mmx_ok(); /* Announce return value of mmx_ok() */ // printf("Value returned from init was %x.", rval); // printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not"); // fflush(stdout); fflush(stderr); // if(rval) { /* PADD *****************************************************/ ma.q = 0x1111111180000000LL; mb.q = 0x7fffffff00000001LL; paddd(ma, mb); fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddd: mb.q is 9111111080000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddb(ma, mb); fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddb: mb.q is 8180000281800002\n"); fflush(stdout); fflush(stderr); /* PADDS ****************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n"); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n"); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddsb(ma, mb); fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n"); fflush(stdout); fflush(stderr); /* PADDUS ***************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddusw(ma, mb); fprintf(stdout, "paddusw: 
mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddusw(ma, mb); fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddusb(ma, mb); fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n"); fflush(stdout); fflush(stderr); /* PSUB *****************************************************/ ma.q = 0x7fffffff00000001LL; mb.q = 0x1111111180000000LL; psubd(ma, mb); fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubd: mb.q is 911111127fffffff\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 8001800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubb(ma, mb); fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n"); fflush(stdout); fflush(stderr); /* PSUBS ****************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; 
psubsb(ma, mb); fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n"); fflush(stdout); fflush(stderr); /* PSUBUS ***************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubusb(ma, mb); fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n"); fflush(stdout); fflush(stderr); /* PMUL *****************************************************/ ma.q = 0x8000ffff00ff0000LL; mb.q = 0x0200ffff00ffffffLL; pmulhw(ma, mb); fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0200ffff00ffffffLL; pmullw(ma, mb); fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n"); fflush(stdout); fflush(stderr); /* PMADD ****************************************************/ ma.q = 0x8000345680007f34LL; mb.q = 0x93234a27ffff1707LL; pmaddwd(ma, mb); fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n"); fflush(stdout); fflush(stderr); /* PCMPEQ ***************************************************/ ma.q = 0x800034568f237f34LL; mb.q = 0x93009a568f237f34LL; pcmpeqd(ma, mb); fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqw(ma, mb); fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q); fprintf(stderr, 
"pcmpeqw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqb(ma, mb); fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n"); fflush(stdout); fflush(stderr); /* PCMPGT ***************************************************/ ma.q = 0x666688884477aaffLL; mb.q = 0x1234567890abcdefLL; pcmpgtd(ma, mb); fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtw(ma, mb); fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtb(ma, mb); fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n"); fflush(stdout); fflush(stderr); /* PACKSS ***************************************************/ ma.q = 0x00012222000abbbbLL; mb.q = 0x0000888800003333LL; packssdw(ma, mb); fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q); fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n"); fflush(stdout); fflush(stderr); ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packsswb(ma, mb); fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n"); fflush(stdout); fflush(stderr); /* PACKUS ***************************************************/ ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packuswb(ma, mb); fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n"); fflush(stdout); fflush(stderr); /* PUNPCKH **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckhdq(ma, mb); fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n"); fflush(stdout); fflush(stderr); mb.q = 
0x0102030405060708LL; punpckhwd(ma, mb); fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpckhbw(ma, mb); fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n"); fflush(stdout); fflush(stderr); /* PUNPCKL **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckldq(ma, mb); fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklwd(ma, mb); fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklbw(ma, mb); fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n"); fflush(stdout); fflush(stderr); /* PAND, PANDN, POR, PXOR ***********************************/ ma.q = 0x5555555555555555LL; mb.q = 0x3333333333333333LL; pand(ma, mb); fprintf(stdout, "pand: mb.q is %016llx\n", mb.q); fprintf(stderr, "pand: mb.q is 1111111111111111\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pandn(ma, mb); fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q); fprintf(stderr, "pandn: mb.q is 4444444444444444\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; por(ma, mb); fprintf(stdout, "por: mb.q is %016llx\n", mb.q); fprintf(stderr, "por: mb.q is 7777777777777777\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pxor(ma, mb); fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q); fprintf(stderr, "pxor: mb.q is 6666666666666666\n"); fflush(stdout); fflush(stderr); /* PSLL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psllq(ma, mb); 
fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; pslld(ma, mb); fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q); fprintf(stderr, "pslld: mb.q is 67000000ef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psllw(ma, mb); fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrlq(ma, mb); fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlq: mb.q is 0000000123456789\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrld(ma, mb); fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrld: mb.q is 0000000100000089\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrlw(ma, mb); fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRA *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrad(ma, mb); fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psraw(ma, mb); fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); /* Exit MXX *************************************************/ emms(); } /* Clean-up and exit nicely */ exit(0); }
VLC_MMX static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { assert( p_pic_top->i_planes == p_pic_bot->i_planes ); /* Amount of bits must be known for MMX, thus int32_t. Doesn't hurt the C implementation. */ int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */ int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */ pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */ for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); const int wm8 = w % 8; /* remainder */ const int w8 = w - wm8; /* part of width that is divisible by 8 */ /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ int x = 0; /* Easy-to-read C version further below. 
Assumptions: 0 < T < 127 # of pixels < (2^32)/255 Note: calculates score * 255 */ static alignas (8) const mmx_t b0 = { .uq = 0x0000000000000000ULL }; static alignas (8) const mmx_t b128 = { .uq = 0x8080808080808080ULL }; static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; for( ; x < w8; x += 8 ) { movq_m2r( *((int64_t*)p_c), mm0 ); movq_m2r( *((int64_t*)p_p), mm1 ); movq_m2r( *((int64_t*)p_n), mm2 ); psubb_m2r( b128, mm0 ); psubb_m2r( b128, mm1 ); psubb_m2r( b128, mm2 ); psubsb_r2r( mm0, mm1 ); psubsb_r2r( mm0, mm2 ); pxor_r2r( mm3, mm3 ); pxor_r2r( mm4, mm4 ); pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); punpcklbw_r2r( mm1, mm3 ); punpcklbw_r2r( mm2, mm4 ); punpckhbw_r2r( mm1, mm5 ); punpckhbw_r2r( mm2, mm6 ); pmulhw_r2r( mm3, mm4 ); pmulhw_r2r( mm5, mm6 ); packsswb_r2r(mm4, mm6); pcmpgtb_m2r( bT, mm6 ); psadbw_m2r( b0, mm6 ); paddd_r2r( mm6, mm7 ); p_c += 8; p_p += 8; p_n += 8; } for( ; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. */ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score_c; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } movd_r2m( mm7, i_score_mmx ); emms(); return i_score_mmx/255 + i_score_c; } #endif /* See header for function doc. */ int CalculateInterlaceScore( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { /* We use the comb metric from the IVTC filter of Transcode 1.1.5. This was found to work better for the particular purpose of IVTC than RenderX()'s comb metric. 
Note that we *must not* subsample at all in order to catch interlacing in telecined frames with localized motion (e.g. anime with characters talking, where only mouths move and everything else stays still.) */ assert( p_pic_top != NULL ); assert( p_pic_bot != NULL ); if( p_pic_top->i_planes != p_pic_bot->i_planes ) return -1; #ifdef CAN_COMPILE_MMXEXT if (vlc_CPU_MMXEXT()) return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot ); #endif int32_t i_score = 0; for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ for( int x = 0; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. 
*/ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } return i_score; }
VLC_MMX static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, int* pi_top, int* pi_bot ) { int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); (*pi_top) = ( i_top_motion >= 8 ); (*pi_bot) = ( i_bot_motion >= 8 ); return (i_motion >= 8); }