Exemple #1
0
void _dv_idct_88(dv_coeff_t *block) 
{
#if ARCH_X86_64

  _dv_idct_block_mmx_x86_64(block);
  emms();

#elif ARCH_X86

  _dv_idct_block_mmx(block);
  emms();

#else /* ARCH_X86 */

  int v,h,y,x,i;
  double temp[64];

  memset(temp,0,sizeof(temp));
  for (v=0;v<8;v++) {
    for (h=0;h<8;h++) {
      for (y=0;y<8;y++){ 
	for (x=0;x<8;x++) {
	  temp[y*8+x] += C[v] * C[h] * block[v*8+h] * KC88[x][y][h][v];
	}
      }
    }
  }
	
  for (i=0;i<64;i++)
    block[i] = temp[i];
#endif
}
Exemple #2
0
int main(int ac, char **av)
{
   int i, j, k, n;
   unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
   long long *datp = (long long *)&dat0;
   int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
   volatile uint8_t *rfp = dat0;
   volatile int16_t *bp  = dat1;
   unsigned char ans1[8], ans2[8];

   n = 0;
   for( i=-32768; i<32768; ++i ) {
     j = 0;
     while( j < 256 ) {
        for( k=0; k<8; ++k ) {
          dat0[k] = i;
          dat1[k] = j++;
        }
       movq_m2r(m_(&rfp[0]),mm1);  /* rfp[0..7] */
       pxor_r2r(mm3,mm3);
       pxor_r2r(mm4,mm4);
       movq_m2r(m_(&bp[0]),mm5);   /* bp[0..3] */
       movq_r2r(mm1,mm2);
       movq_m2r(m_(&bp[4]),mm6);   /* bp[4..7] */
       punpcklbw_r2r(mm3,mm1);     /* rfp[0,2,4,6] */
       punpckhbw_r2r(mm3,mm2);     /* rfp[1,3,5,7] */
       paddsw_r2r(mm5,mm1);        /* bp[0..3] */
       paddsw_r2r(mm6,mm2);        /* bp[4..7] */
       pcmpgtw_r2r(mm1,mm3);
       pcmpgtw_r2r(mm2,mm4);
       pandn_r2r(mm1,mm3);
       pandn_r2r(mm2,mm4);
       packuswb_r2r(mm4,mm3);
       movq_r2m(mm3,m_(&ans1[0]));
       emms();

       ans2[0] = clip(bp[0] + rfp[0]);
       ans2[1] = clip(bp[1] + rfp[1]);
       ans2[2] = clip(bp[2] + rfp[2]);
       ans2[3] = clip(bp[3] + rfp[3]);
       ans2[4] = clip(bp[4] + rfp[4]);
       ans2[5] = clip(bp[5] + rfp[5]);
       ans2[6] = clip(bp[6] + rfp[6]);
       ans2[7] = clip(bp[7] + rfp[7]);

       if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] )
       {
         printf(" i=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", i,
           ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]);
         printf(" j=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", j,
           ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]);
       //  exit(0);
       }
       n += 8;
     }
   }

   printf("n=%d\n",n);
   return 0;
}
Exemple #3
0
/* For a 16*h block, this computes
   (((((*pf + *pf2 + 1)>>1) + ((*pb + *pb2 + 1)>>1) + 1)>>1) + *p2 + 1)>>1
*/
static int bsad_0quad_mmxe(uint8_t *pf,uint8_t *pf2,uint8_t *pb,uint8_t *pb2,uint8_t *p2,int lx,int h)
{
    int32_t s=0;

    pxor_r2r(mm7, mm7);
    do {
        movq_m2r(pf2[0],mm0);
        movq_m2r(pf2[8],mm2);
        movq_m2r(pb2[0],mm1);
        movq_m2r(pb2[8],mm3);
        pavgb_m2r(pf[0],mm0);
        pavgb_m2r(pf[8],mm2);
        pavgb_m2r(pb[0],mm1);
        pavgb_m2r(pb[8],mm3);
        pavgb_r2r(mm1,mm0);
        pavgb_r2r(mm3,mm2);
        psadbw_m2r(p2[0],mm0);
        psadbw_m2r(p2[8],mm2);
        paddd_r2r(mm0,mm7);
        paddd_r2r(mm2,mm7);

        pf+=lx;
        pf2+=lx;
        pb+=lx;
        pb2+=lx;
        p2+=lx;

        h--;
    } while (h);
    movd_r2g(mm7,s);
    emms();
    return s;
}
Exemple #4
0
static char *status_get_totalcps(char *buffer)
{
    int use_ticks, bufcat = 0;
    clock_t ticks;
    unsigned long time, sumtime;
    unsigned long long cps;
    double crypts, sumcrypts;
    unsigned cps_100;

    emms();

    use_ticks = !status.crypts.hi && !status_restored_time;

    ticks = get_time() - status.start_time;
    if (use_ticks)
        time = ticks;
    else
        time = status_restored_time + ticks / clk_tck;

    crypts = ((long long)status.crypts.hi << 32) + status.crypts.lo;

    // This calculates the total cps figure (total crypts / avg run time).
    // It will show optimistic if the nodes don't finish at the same time
    MPI_Reduce(&time, &sumtime, 1, MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Reduce(&crypts, &sumcrypts, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    time = sumtime / mpi_p;
    crypts = sumcrypts;

    if (use_ticks) crypts *= clk_tck;
    cps = crypts / (time ? time : 1);

    if (cps >= 1000000000000LL)
        bufcat = sprintf(buffer, "%lluG", (cps / 1000000000));
    else if (cps >= 1000000000)
        bufcat = sprintf(buffer, "%lluM", (cps / 1000000));
    else if (cps >= 1000000)
        bufcat = sprintf(buffer, "%lluK", (cps / 1000));
    else if (cps >= 100)
        bufcat = sprintf(buffer, "%llu", cps);
    else {
        cps_100 = (unsigned)((unsigned long long)(crypts * 100 / (time ? time : 1)) % 100);
        bufcat = sprintf(buffer, "%llu.%02u", cps, cps_100);
    }

    cps = crypts / mpi_p / (time ? time : 1);

    if (cps >= 1000000000000LL)
        sprintf(&buffer[bufcat], " avg %lluG", (cps / 1000000000));
    else if (cps >= 1000000000)
        sprintf(&buffer[bufcat], " avg %lluM", (cps / 1000000));
    else if (cps >= 1000000)
        sprintf(&buffer[bufcat], " avg %lluK", (cps / 1000));
    else if (cps >= 100)
        sprintf(&buffer[bufcat], " avg %llu", cps);
    else {
        cps_100 = (unsigned)((unsigned long long)(crypts * 100 / mpi_p / (time ? time : 1)) % 100);
        sprintf(&buffer[bufcat], " avg%llu.%02u", cps, cps_100);
    }
    return buffer;
}
Exemple #5
0
static double get_progress(void)
{
    emms();

    return progress ? progress :
           (double)rule_number / (rule_count + 1) * 100.0;
}
static void frame_i2f_sse(u_char *src,float *dst,int l)
{
    int i;

    pxor_r2r(mm7,mm7);

    for( i=0; i<l; i+=8 ) {
        movq_m2r(*src,mm0);
        movq_r2r(mm0, mm2);
        punpcklbw_r2r(mm7, mm0);
        punpckhbw_r2r(mm7, mm2);
        movq_r2r(mm0, mm1);
        movq_r2r(mm2, mm3);
        punpcklwd_r2r(mm7, mm0);
        punpckhwd_r2r(mm7, mm1);
        punpcklwd_r2r(mm7, mm2);
        punpckhwd_r2r(mm7, mm3);
        cvtpi2ps_r2r(mm0,xmm0);
        cvtpi2ps_r2r(mm1,xmm1);
        cvtpi2ps_r2r(mm2,xmm2);
        cvtpi2ps_r2r(mm3,xmm3);
        movlps_r2m(xmm0,dst[0]);
        movlps_r2m(xmm1,dst[2]);
        movlps_r2m(xmm2,dst[4]);
        movlps_r2m(xmm3,dst[6]);

        src+=8;
        dst+=8;
    }
    emms();
}
Exemple #7
0
static char *status_get_ETA(char *percent, unsigned int secs_done)
{
	static char s_ETA[128];
	char *cp;
	double sec_left, percent_left;
	time_t t_ETA;
	struct tm *pTm;

	emms();

	/* Compute the ETA for this run.  Assumes even run time for
	   work currently done and work left to do, and that the CPU
	   utilization of work done and work to do will stay same
	   which may not always be valid assumptions */
	cp = percent;
	while (cp && *cp && isspace(((unsigned char)(*cp))))
		++cp;
	if (!cp || *cp == 0 || !isdigit(((unsigned char)(*cp))))
		return "";  /* dont show ETA if no valid percentage. */
	else
	{
		double chk;
		percent_left = atof(percent);
		t_ETA = time(NULL);
		if (percent_left >= 100.0) {
			pTm = localtime(&t_ETA);
			strcpy(s_ETA, " (");
			strftime(&s_ETA[2], sizeof(s_ETA)-3, timeFmt, pTm);
			strcat(s_ETA, ")");
			return s_ETA;
		}
		if (percent_left == 0 || percent_left < ETAthreshold)
			return "";  /* mute ETA if too little progress */
		percent_left /= 100;
		sec_left = secs_done;
		sec_left /= percent_left;
		sec_left -= secs_done;
		/* Note, many localtime() will fault if given a time_t
		   later than Jan 19, 2038 (i.e. 0x7FFFFFFFF). We
		   check for that here, and if so, this run will
		   not end anyway, so simply tell user to not hold
		   her breath */
		chk = sec_left;
		chk += t_ETA;
		if (chk > 0x7FFFF000) { /* slightly less than 'max' 32 bit time_t, for safety */
			strcpy(s_ETA, " (ETA: never)");
			return s_ETA;
		}
		t_ETA += sec_left;
		pTm = localtime(&t_ETA);
		strcpy(s_ETA, " (ETA: ");
		if (sec_left < 24 * 3600)
			strftime(&s_ETA[7], sizeof(s_ETA)-10, timeFmt24, pTm);
		else
			strftime(&s_ETA[7], sizeof(s_ETA)-10, timeFmt, pTm);
		strcat(s_ETA, ")");
	}
	return s_ETA;
}
Exemple #8
0
EAPI void
evas_common_cpu_end_opt(void)
{
   if (cpu_feature_mask & (CPU_FEATURE_MMX | CPU_FEATURE_MMX2))
     {
	emms();
     }
}
Exemple #9
0
void test_motion(const char *name,
                 motion_func *test_func, motion_func *ref_func)
{
    int x, y, d1, d2, it;
    uint8_t *ptr;
    int64_t ti;
    printf("testing '%s'\n", name);

    /* test correctness */
    for(it=0;it<20;it++) {

        fill_random(img1, WIDTH * HEIGHT);
        fill_random(img2, WIDTH * HEIGHT);
        
        for(y=0;y<HEIGHT-17;y++) {
            for(x=0;x<WIDTH-17;x++) {
                ptr = img2 + y * WIDTH + x; 
                d1 = test_func(img1, ptr, WIDTH);
                d2 = ref_func(img1, ptr, WIDTH);
                if (d1 != d2) {
                    printf("error: mmx=%d c=%d\n", d1, d2);
                }
            }
        }
    }
    emms();
    
    /* speed test */
    ti = gettime();
    d1 = 0;
    for(it=0;it<NB_ITS;it++) {
        for(y=0;y<HEIGHT-17;y++) {
            for(x=0;x<WIDTH-17;x++) {
                ptr = img2 + y * WIDTH + x; 
                d1 += test_func(img1, ptr, WIDTH);
            }
        }
    }
    emms();
    dummy = d1; /* avoid optimisation */
    ti = gettime() - ti;
    
    printf("  %0.0f kop/s\n", 
           (double)NB_ITS * (WIDTH - 16) * (HEIGHT - 16) / 
           (double)(ti / 1000.0));
}
Exemple #10
0
static double get_progress(void)
{
	// This is a dummy function just for getting the DONE
	// timestamp from status.c - it will return -1 all
	// the time except when a mode is finished
	emms();

	return progress;
}
Exemple #11
0
void _dv_dct_88(dv_coeff_t *block) {
#if ((!ARCH_X86) && (!ARCH_X86_64))
#if BRUTE_FORCE_DCT_88
  int v,h,y,x,i;
  double temp[64];
  int factor = pow(2, DCT_YUV_PRECISION);

  memset(temp,0,sizeof(temp));
  for (v = 0; v < 8; v++) {
    for (h = 0; h < 8; h++) {
      for (y = 0;y < 8; y++) {
	for (x = 0;x < 8; x++) {
	  temp[v * 8 + h] += block[x * 8 + y] 
	    * KC88[x][y][h][v];
	}
      }
      temp[v * 8 + h] *= (C[h] * C[v]);
    }
  }

  for (i = 0; i < 64; i++) {
    block[i] = temp[i] / factor;
  }
#else /* BRUTE_FORCE_DCT_88 */
  dct88_aan(block);
  postscale88(block);
#endif /* BRUTE_FORCE_DCT_88 */
#elif ARCH_X86_64
  _dv_dct_88_block_mmx_x86_64(block);
  _dv_transpose_mmx_x86_64(block);
  _dv_dct_88_block_mmx_x86_64(block);
  _dv_dct_block_mmx_x86_64_postscale_88(block, postSC88);
  emms(); 
#else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
  _dv_dct_88_block_mmx(block);
  _dv_transpose_mmx(block);
  _dv_dct_88_block_mmx(block);
  _dv_dct_block_mmx_postscale_88(block, postSC88);
  emms(); 
#endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
}
	void RegisterAllocator::spillMMXcept(const OperandMMREG &r64)
	{
		for(int i = 0; i < 8; i++)
		{
			if(r64.reg != i)
			{
				spill64(i);
			}
		}

		emms();
	}
Exemple #13
0
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  gint i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  width = width & 0x7;

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
int SIMD_SUFFIX(mblocks_sub44_mests)( uint8_t *blk,  uint8_t *ref,
					int ilow,int jlow,
					int ihigh, int jhigh, 
					int h, int rowstride, 
					int threshold,
					me_result_s *resvec)
{
	int32_t x,y;
	uint8_t *currowblk = blk;
	uint8_t *curblk;
	me_result_s *cres = resvec;
	int      gridrowstride = rowstride;
	int weight;

        SIMD_SUFFIX(init_qblock_sad)(ref, h, rowstride);
	for( y=jlow; y <= jhigh ; y+=4)
	{
		curblk = currowblk;
		// You'd think prefetching curblk+4*rowstride would help here.
		// I have found *NO* measurable increase in performance...
		
		for( x = ilow; x <= ihigh; x += 4)
		{
			if( (x & 15) == (ilow & 15) )
			{
				load_blk( curblk, rowstride, h );
                                curblk += 4;
			}
			weight = SIMD_SUFFIX(qblock_sad)(ref, h, rowstride);
			shift_blk(8);
			if( weight <= threshold )
			{
				threshold = intmin(weight<<2,threshold);
				/* Rough and-ready absolute distance penalty */
				/* NOTE: This penalty is *vital* to correct operation 
				   as otherwise the sub-mean filtering won't work on very
				   uniform images.
				 */
				cres->weight = (uint16_t)(weight+(intmax(abs(x),abs(y))<<2));
				cres->x = (uint8_t)x;
				cres->y = (uint8_t)y;
				++cres;
			}
		}
		currowblk += gridrowstride;
	}
	emms();
	return cres - resvec;
}
Exemple #15
0
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
    uint8_t * lum_m3, uint8_t * lum_m2,
    uint8_t * lum_m1, uint8_t * lum, int size)
{
  mmx_t rounder;

  rounder.uw[0] = 4;
  rounder.uw[1] = 4;
  rounder.uw[2] = 4;
  rounder.uw[3] = 4;
  pxor_r2r (mm7, mm7);
  movq_m2r (rounder, mm6);

  for (; size > 3; size -= 4) {
    movd_m2r (*lum_m4, mm0);
    movd_m2r (*lum_m3, mm1);
    movd_m2r (*lum_m2, mm2);
    movd_m2r (*lum_m1, mm3);
    movd_m2r (*lum, mm4);
    punpcklbw_r2r (mm7, mm0);
    punpcklbw_r2r (mm7, mm1);
    punpcklbw_r2r (mm7, mm2);
    punpcklbw_r2r (mm7, mm3);
    punpcklbw_r2r (mm7, mm4);
    paddw_r2r (mm3, mm1);
    psllw_i2r (1, mm2);
    paddw_r2r (mm4, mm0);
    psllw_i2r (2, mm1);         // 2
    paddw_r2r (mm6, mm2);
    paddw_r2r (mm2, mm1);
    psubusw_r2r (mm0, mm1);
    psrlw_i2r (3, mm1);         // 3
    packuswb_r2r (mm7, mm1);
    movd_r2m (mm1, *dst);
    lum_m4 += 4;
    lum_m3 += 4;
    lum_m2 += 4;
    lum_m1 += 4;
    lum += 4;
    dst += 4;
  }
  emms ();

  /* Handle odd widths */
  if (size > 0)
    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
Exemple #16
0
static void yuy2_to_yv12_mmxext
(const unsigned char *yuy2_map, int yuy2_pitch,
 unsigned char *y_dst, int y_dst_pitch, 
 unsigned char *u_dst, int u_dst_pitch, 
 unsigned char *v_dst, int v_dst_pitch, 
 int width, int height) 
{
#if HAVE_MMX
    const uint8_t *p_line1, *p_line2 = yuy2_map;
    uint8_t *p_y1, *p_y2 = y_dst;
    uint8_t *p_u = u_dst;
    uint8_t *p_v = v_dst;

    int i_x, i_y;

    const int i_dest_margin = y_dst_pitch - width;
    const int i_dest_u_margin = u_dst_pitch - width/2;
    const int i_dest_v_margin = v_dst_pitch - width/2;
    const int i_source_margin = yuy2_pitch - width*2;

    __asm__ __volatile__(
            "pcmpeqw %mm7, %mm7           \n\t"
            "psrlw $8, %mm7               \n\t" /* 00 ff 00 ff 00 ff 00 ff */
            );

    for ( i_y = height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += yuy2_pitch;

        p_y1 = p_y2;
        p_y2 += y_dst_pitch;

        for ( i_x = width / 8 ; i_x-- ; )
        {
            MMXEXT_YUYV_YUV420( );
        }

        p_y2 += i_dest_margin;
        p_u += i_dest_u_margin;
        p_v += i_dest_v_margin;
        p_line2 += i_source_margin;
    }

    sfence();
    emms();
#endif
}
Exemple #17
0
static PyObject* Codec_Encode( PyCodecObject* obj, PyObject *args)
{
//#ifndef WIN32
	PyVFrameObject* cFrame = NULL;
	PyObject* cRes = NULL;
	int iLen = 0;
	AVFrame picture;

#define ENCODE_OUTBUF_SIZE 300000

	char sOutbuf[ ENCODE_OUTBUF_SIZE ];
	if (!PyArg_ParseTuple(args, "O!", &VFrameType, &cFrame ))
		return NULL;

	if (!(obj->cCodec ||obj->cCodec->codec))
	{
		PyErr_SetString(g_cErr, "Encode error:codec not initialized" );
		return NULL;
	}

	//reset codec parameters if frame size is  smaller than codec frame size
	if (!obj->cCodec->width || ! obj->cCodec->height)
	{
		PyErr_SetString(g_cErr, "Encode: zero frame size set in codec" );
		return NULL;
	}

	if ((obj->cCodec->width > frame_get_width(cFrame)) ||
	   (obj->cCodec->height > frame_get_height(cFrame)) )
	{
		PyErr_SetString(g_cErr, "Encode: cannot change resolution for frame. Use scaling first..." );
		return NULL;
	}

	/* check codec params */
	PyVFrame2AVFrame(cFrame, &picture, 1 );
	iLen = avcodec_encode_video(obj->cCodec, sOutbuf, ENCODE_OUTBUF_SIZE,	&picture);
	if (iLen > 0)
		cRes= Frame_New_LAVC_Enc( obj, sOutbuf, iLen );
	else
		PyErr_Format(g_cErr, "Failed to encode frame( error code is %d )", iLen );

#ifdef HAVE_MMX
    emms();
#endif
 	return cRes;
}
Exemple #18
0
static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top,
                                                   uint8_t *bot, int width )
{
    int i;

    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pavgb_r2r( mm1, mm0 );
        pavgb_r2r( mm3, mm2 );
        pavgb_r2r( mm5, mm4 );
        pavgb_r2r( mm7, mm6 );
        movntq_r2m( mm0, *output );
        movntq_r2m( mm2, *(output + 8) );
        movntq_r2m( mm4, *(output + 16) );
        movntq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);

    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pavgb_r2r( mm1, mm0 );
        movntq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    width = width & 0x7;

    /* Handle last few pixels. */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }

    sfence();
    emms();
}
Exemple #19
0
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  gint i;

  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  size = size & 0xf;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
Exemple #20
0
void _dv_dct_248(dv_coeff_t *block) 
{
#if ((!ARCH_X86) && (!ARCH_X86_64))
#if BRUTE_FORCE_DCT_248
  int u,h,z,x,i;
  double temp[64];
  int factor = pow(2, DCT_YUV_PRECISION);
	
  memset(temp,0,sizeof(temp));
  for (u=0;u<4;u++) {
    for (h=0;h<8;h++) {
      for (z=0;z<4;z++) {
	for (x=0;x<8;x++) {
	  temp[u*8+h] +=     (block[x*8+2*z] + block[x*8+(2*z+1)]) *
		  KC248[x][z][u][h];
	  temp[(u+4)*8+h] += (block[x*8+2*z] - block[x*8+(2*z+1)]) *
	    KC248[x][z][u][h];
	}
      }
      temp[u*8+h] *= (C[h] * C[u]);
      temp[(u+4)*8+h] *= (C[h] * C[u]);
    }
  }

  for (i=0;i<64;i++)
	  block[i] = temp[i] / factor;
#else /* BRUTE_FORCE_DCT_248 */
  dct248_aan(block);
  postscale248(block);
#endif /* BRUTE_FORCE_DCT_248 */
#elif ARCH_X86_64
  _dv_dct_88_block_mmx_x86_64(block);
  _dv_transpose_mmx_x86_64(block);
  _dv_dct_248_block_mmx_x86_64(block);
  _dv_dct_248_block_mmx_x86_64_post_sum(block);
  _dv_dct_block_mmx_x86_64_postscale_248(block, postSC248);
#else /* ((!ARCH_X86) && (!ARCH_X86_64)) */
  _dv_dct_88_block_mmx(block);
  _dv_transpose_mmx(block);
  _dv_dct_248_block_mmx(block);
  _dv_dct_248_block_mmx_post_sum(block);
  _dv_dct_block_mmx_postscale_248(block, postSC248);
  emms();
#endif /* ((!ARCH_X86) && (!ARCH_X86_64)) */
}
Exemple #21
0
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count)
{
    int a,b;

    pxor_r2r(mm6,mm6); // mm6 (aka count) = 0
    pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0
    movq_m2r(*refpix,mm3); // mm3 = refpix[0]

    movd_g2r(0x80808080,mm4); // mm4 = 128
    punpcklbw_r2r(mm4,mm4);

    pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128

    movd_g2r(threshold,mm5); // mm5 = threshold
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);

    for( b=0; b<radius_count; b++ ) {
        for( a=0; a<radius_count; a++ ) {
            movq_m2r(*pixel,mm0); // mm0  = pixel[0]
            pxor_r2r(mm4,mm0);    // mm0  = pixel[0]-128
            movq_r2r(mm3,mm2);    // mm2  = refpix[0]-128
            psubsb_r2r(mm0,mm2);  // mm2  = refpix[0]-pixel[0]
            psubsb_r2r(mm3,mm0);  // mm0  = pixel[0]-refpix[0]
            pminub_r2r(mm0,mm2);  // mm2  = abs(pixel[0]-refpix[0])
            movq_r2r(mm5,mm1);    // mm1  = threshold
            pcmpgtb_r2r(mm2,mm1); // mm1  = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1,mm6);   // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1,mm0);    // mm0  = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0,mm7);   // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0

            ++pixel;
        }
        pixel += row_stride - radius_count;
    }

    movq_r2m(mm6,*count);
    movq_r2m(mm7,*diff);

    emms();
 
}
Exemple #22
0
static void fast_memcpy_mmxext( void *d, const void *s, size_t n )
{
    const uint8_t *src = s;
    uint8_t *dest = d;

    if( dest != src ) {
        while( n > 64 ) {
            movq_m2r( src[ 0 ], mm0 );
            movq_m2r( src[ 8 ], mm1 );
            movq_m2r( src[ 16 ], mm2 );
            movq_m2r( src[ 24 ], mm3 );
            movq_m2r( src[ 32 ], mm4 );
            movq_m2r( src[ 40 ], mm5 );
            movq_m2r( src[ 48 ], mm6 );
            movq_m2r( src[ 56 ], mm7 );
            movntq_r2m( mm0, dest[ 0 ] );
            movntq_r2m( mm1, dest[ 8 ] );
            movntq_r2m( mm2, dest[ 16 ] );
            movntq_r2m( mm3, dest[ 24 ] );
            movntq_r2m( mm4, dest[ 32 ] );
            movntq_r2m( mm5, dest[ 40 ] );
            movntq_r2m( mm6, dest[ 48 ] );
            movntq_r2m( mm7, dest[ 56 ] );
            dest += 64;
            src += 64;
            n -= 64;
        }

        while( n > 8 ) {
            movq_m2r( src[ 0 ], mm0 );
            movntq_r2m( mm0, dest[ 0 ] );
            dest += 8;
            src += 8;
            n -= 8;
        }

        if( n ) small_memcpy( dest, src, n );

        sfence();
        emms();
    }
}
	void RegisterAllocator::free64(int i)
	{
		bool free = (MMX[i].priority != 0);

		if(MMX[i].loadInstruction && loadElimination)
		{
			MMX[i].loadInstruction->reserve();
			MMX[i].loadInstruction = 0;
		}

		if(MMX[i].copyInstruction && copyPropagation)
		{
			MMX[i].copyInstruction->reserve();
			MMX[i].copyInstruction = 0;
		}

		MMX[i].reference = 0;
		MMX[i].partial = 0;
		MMX[i].priority = 0;

		if(free && autoEMMS)
		{
			for(int i = 0; i < 8; i++)
			{
				if(MMX[i].priority != 0)
				{
					return;
				}
			}

			// Last one freed
			emms();

			// Completely eraze MMX allocation state
			for(int i = 0; i < 8; i++)
			{
				MMX[i].free();
			}
		}
	}
static void frame_f2i_sse(float *src,u_char *dst,int l)
{
    int i;

    // put 128 in all 4 words of mm7
    movd_g2r(128,mm7);
    punpcklwd_r2r(mm7,mm7);
    punpckldq_r2r(mm7,mm7);

    // put 128 in all 8 bytes of mm6
    movd_g2r(128,mm6);
    punpcklbw_r2r(mm6,mm6);
    punpcklwd_r2r(mm6,mm6);
    punpckldq_r2r(mm6,mm6);

    for( i=0; i<l; i+=8 ) {
        movaps_m2r(src[0],xmm0);
        movaps_m2r(src[4],xmm2);
        movhlps_r2r(xmm0,xmm1);
        cvtps2pi_r2r(xmm0,mm0);
        cvtps2pi_r2r(xmm1,mm1);
        movhlps_r2r(xmm2,xmm3);
        cvtps2pi_r2r(xmm2,mm2);
        cvtps2pi_r2r(xmm3,mm3);
        packssdw_r2r(mm1,mm0);
        packssdw_r2r(mm3,mm2);
        psubw_r2r(mm7,mm0);
        psubw_r2r(mm7,mm2);
        packsswb_r2r(mm2, mm0);
        paddb_r2r(mm6, mm0);
        movq_r2m(mm0,dst[0]);

        src+=8;
        dst+=8;
    }

    emms();
}
Exemple #25
0
static void status_print_cracking(char *percent)
{
	unsigned int time = status_get_time();
	char *key, saved_key[PLAINTEXT_BUFFER_SIZE] = "";
	char s_cps[64], cand[32] = "";

	emms();

	if (!(options.flags & FLG_STATUS_CHK))
		if ((key = crk_get_key2()))
			strnzcpy(saved_key, key, PLAINTEXT_BUFFER_SIZE);

	if (showcand)
		sprintf(cand, "/%.0f", (double)((long long)status.crypts.hi << 32) + status.crypts.lo);

#ifdef HAVE_MPI
	// we need to print until cr in one call, otherwise output gets interleaved
	char nodeid[11] = "";
	if (mpi_p > 1)
		snprintf(nodeid, sizeof(nodeid), "%3d: ", mpi_id);
	nodeid[sizeof(nodeid)-1] = 0;
	char trying[256];
	if ((options.flags & FLG_STATUS_CHK) ||
	    !(status.crypts.lo | status.crypts.hi))
		trying[0] = 0;
	else {
		UTF8 t1buf[PLAINTEXT_BUFFER_SIZE + 1];
		UTF8 t2buf[PLAINTEXT_BUFFER_SIZE + 1];
		char *t1, *t2;
		if (options.report_utf8 && !options.utf8) {
			t1 = (char*)enc_to_utf8_r(crk_get_key1(), t1buf, PLAINTEXT_BUFFER_SIZE);
			t2 = (char*)enc_to_utf8_r(saved_key, t2buf, PLAINTEXT_BUFFER_SIZE);
		} else {
			t1 = crk_get_key1();
			t2 = saved_key;
		}
		snprintf(trying, sizeof(trying),
		         "%strying: %s%s%s",
		         mpi_p > 1 ? " " : "  ",
		         t1, t2[0] ? " - " : "", t2);
	}

	fprintf(stderr,
	        "%s"
	        "guesses: %u%s%s"
	        "time: %u:%02u:%02u:%02u"
	        "%s%s%s"
	        "c/s: %s"
	        "%s\n",
	        nodeid,
	        status.guess_count, cand,
	        mpi_p > 1 ? " " : "  ",
	        time / 86400, time % 86400 / 3600, time % 3600 / 60, time % 60,
	        strncmp(percent, " 100", 4) ? percent : " DONE",
	        status_get_ETA(percent,time),
	        mpi_p > 1 ? " " : "  ",
	        status_get_cps(s_cps),
	        trying);
#else
	fprintf(stderr,
		"guesses: %u%s  "
		"time: %u:%02u:%02u:%02u"
		"%s%s  "
		"c/s: %s",
		status.guess_count, cand,
		time / 86400, time % 86400 / 3600, time % 3600 / 60, time % 60,
		strncmp(percent, " 100", 4) ? percent : " DONE",
		status_get_ETA(percent,time),
		status_get_cps(s_cps));

	if ((options.flags & FLG_STATUS_CHK) ||
	    !(status.crypts.lo | status.crypts.hi))
		fputc('\n', stderr);
	else {
		UTF8 t1buf[PLAINTEXT_BUFFER_SIZE + 1];
		UTF8 t2buf[PLAINTEXT_BUFFER_SIZE + 1];
		char *t1, *t2;
		if (options.report_utf8 && !options.utf8) {
			t1 = (char*)enc_to_utf8_r(crk_get_key1(), t1buf, PLAINTEXT_BUFFER_SIZE);
			t2 = (char*)enc_to_utf8_r(saved_key, t2buf, PLAINTEXT_BUFFER_SIZE);
		} else {
			t1 = crk_get_key1();
			t2 = saved_key;
		}
		fprintf(stderr,	"  trying: %s%s%s\n",
		        t1, t2[0] ? " - " : "", t2);
	}
#endif
}
Exemple #26
0
static char *status_get_totalETA(char *percent, unsigned int secs_done)
{
	static char s_ETA[128];
	char *cp;
	double sec_left, percent_left, max_sec_left;
	time_t t_ETA;
	struct tm *pTm;

	emms();

	cp = percent;
	while (cp && *cp && isspace(*cp))
		++cp;
	if (!cp || *cp == 0 || !isdigit(*cp)) {
		// We must report to MPI_Allreduce anyway
		sec_left = 0;
		MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		return "";  /* dont show ETA if no valid percentage. */
	}
	else
	{
		double chk;
		percent_left = atof(percent);
		t_ETA = time(NULL);
		if (percent_left >= 100.0) {
			// We must report to MPI_Allreduce anyway
			sec_left = 0;
			MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
			pTm = localtime(&t_ETA);
			strcpy(s_ETA, " (");
			strftime(&s_ETA[2], sizeof(s_ETA)-3, timeformat, pTm);
			strcat(s_ETA, ")");
			return s_ETA;
		}
		if (percent_left == 0 || percent_left < ETAthreshold) {
			// We must report to MPI_Allreduce anyway
			sec_left = 0;
			MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
			return "";  /* mute ETA if too little progress */
		}
		percent_left /= 100;
		sec_left = secs_done;
		sec_left /= percent_left;
		sec_left -= secs_done;
		// Reports the worst ETA for all nodes
		MPI_Allreduce(&sec_left, &max_sec_left, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		sec_left = max_sec_left;

		chk = sec_left;
		chk += t_ETA;
		if (chk > 0x7FFFF000) { /* slightly less than 'max' 32 bit time_t, for safety */
			strcpy(s_ETA, " (ETA: never)");
			return s_ETA;
		}
		t_ETA += sec_left;
		pTm = localtime(&t_ETA);
		strcpy(s_ETA, " (ETA: ");
		strftime(&s_ETA[7], sizeof(s_ETA)-10, timeformat, pTm);
		strcat(s_ETA, ")");
	}
	return s_ETA;
}
Exemple #27
0
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  size = size & 0xf;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
Exemple #28
0
int main()
{
	int rval;
	mmx_t ma;
	mmx_t mb;

	movq_r2r(mm0, mm1);

	rval = mmx_ok();

	/* Announce return value of mmx_ok() */
//	printf("Value returned from init was %x.", rval);
//	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");
//	fflush(stdout); fflush(stderr);

//	if(rval)
	{
		/* PADD *****************************************************/
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		paddd(ma, mb);
		fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddd: mb.q is 9111111080000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddb(ma, mb);
		fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddb: mb.q is 8180000281800002\n");
		fflush(stdout); fflush(stderr);


		/* PADDS ****************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n");

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n");

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddsb(ma, mb);
		fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n");
		fflush(stdout); fflush(stderr);


		/* PADDUS ***************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddusb(ma, mb);
		fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n");
		fflush(stdout); fflush(stderr);


		/* PSUB *****************************************************/
		ma.q = 0x7fffffff00000001LL;
		mb.q = 0x1111111180000000LL;
		psubd(ma, mb);
		fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubd: mb.q is 911111127fffffff\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 8001800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubb(ma, mb);
		fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PSUBS ****************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubsb(ma, mb);
		fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n");
		fflush(stdout); fflush(stderr);
 

		/* PSUBUS ***************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubusb(ma, mb);
		fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PMUL *****************************************************/
		ma.q = 0x8000ffff00ff0000LL;
		mb.q = 0x0200ffff00ffffffLL;
		pmulhw(ma, mb);
		fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0200ffff00ffffffLL;
		pmullw(ma, mb);
		fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n");
		fflush(stdout); fflush(stderr);


		/* PMADD ****************************************************/
		ma.q = 0x8000345680007f34LL;
		mb.q = 0x93234a27ffff1707LL;

		pmaddwd(ma, mb);
		fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n");
		fflush(stdout); fflush(stderr);


		/* PCMPEQ ***************************************************/
		ma.q = 0x800034568f237f34LL;
		mb.q = 0x93009a568f237f34LL;

		pcmpeqd(ma, mb);
		fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqw(ma, mb);
		fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqb(ma, mb);
		fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n");
		fflush(stdout); fflush(stderr);



		/* PCMPGT ***************************************************/
		ma.q = 0x666688884477aaffLL;
		mb.q = 0x1234567890abcdefLL;

		pcmpgtd(ma, mb);
		fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtw(ma, mb);
		fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtb(ma, mb);
		fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n");
		fflush(stdout); fflush(stderr);


		/* PACKSS ***************************************************/
		ma.q = 0x00012222000abbbbLL;
		mb.q = 0x0000888800003333LL;

		packssdw(ma, mb);
		fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packsswb(ma, mb);
		fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n");
		fflush(stdout); fflush(stderr);


		/* PACKUS ***************************************************/
		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packuswb(ma, mb);
		fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKH **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckhdq(ma, mb);
		fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhwd(ma, mb);
		fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhbw(ma, mb);
		fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKL **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckldq(ma, mb);
		fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklwd(ma, mb);
		fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklbw(ma, mb);
		fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n");
		fflush(stdout); fflush(stderr);



		/* PAND, PANDN, POR, PXOR ***********************************/
		ma.q = 0x5555555555555555LL;
		mb.q = 0x3333333333333333LL;

		pand(ma, mb);
		fprintf(stdout, "pand: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pand: mb.q is 1111111111111111\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pandn(ma, mb);
		fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pandn: mb.q is 4444444444444444\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		por(ma, mb);
		fprintf(stdout, "por: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "por: mb.q is 7777777777777777\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pxor(ma, mb);
		fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pxor: mb.q is 6666666666666666\n");
		fflush(stdout); fflush(stderr);



		/* PSLL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psllq(ma, mb);
		fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		pslld(ma, mb);
		fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pslld: mb.q is 67000000ef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psllw(ma, mb);
		fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrlq(ma, mb);
		fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlq: mb.q is 0000000123456789\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrld(ma, mb);
		fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrld: mb.q is 0000000100000089\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrlw(ma, mb);
		fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRA *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrad(ma, mb);
		fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psraw(ma, mb);
		fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		/* Exit MXX *************************************************/
		emms();
	}

	/* Clean-up and exit nicely */
	exit(0);
}
Exemple #29
0
VLC_MMX
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
                                       const picture_t* p_pic_bot )
{
    assert( p_pic_top->i_planes == p_pic_bot->i_planes );

    /* Amount of bits must be known for MMX, thus int32_t.
       Doesn't hurt the C implementation. */
    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */

    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
        /* Sanity check */
        if( p_pic_top->p[i_plane].i_visible_lines !=
            p_pic_bot->p[i_plane].i_visible_lines )
            return -1;

        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                             p_pic_bot->p[i_plane].i_visible_pitch );
        const int wm8 = w % 8;   /* remainder */
        const int w8  = w - wm8; /* part of width that is divisible by 8 */

        /* Current line / neighbouring lines picture pointers */
        const picture_t *cur = p_pic_bot;
        const picture_t *ngh = p_pic_top;
        int wc = cur->p[i_plane].i_pitch;
        int wn = ngh->p[i_plane].i_pitch;

        /* Transcode 1.1.5 only checks every other line. Checking every line
           works better for anime, which may contain horizontal,
           one pixel thick cartoon outlines.
        */
        for( int y = 1; y < i_lasty; ++y )
        {
            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */

            int x = 0;

            /* Easy-to-read C version further below.

               Assumptions: 0 < T < 127
                            # of pixels < (2^32)/255
               Note: calculates score * 255
            */
            static alignas (8) const mmx_t b0 = {
                .uq = 0x0000000000000000ULL };
            static alignas (8) const mmx_t b128 = {
                .uq = 0x8080808080808080ULL };
            static alignas (8) const mmx_t bT = {
                .ub = { T, T, T, T, T, T, T, T } };

            for( ; x < w8; x += 8 )
            {
                movq_m2r( *((int64_t*)p_c), mm0 );
                movq_m2r( *((int64_t*)p_p), mm1 );
                movq_m2r( *((int64_t*)p_n), mm2 );

                psubb_m2r( b128, mm0 );
                psubb_m2r( b128, mm1 );
                psubb_m2r( b128, mm2 );

                psubsb_r2r( mm0, mm1 );
                psubsb_r2r( mm0, mm2 );

                pxor_r2r( mm3, mm3 );
                pxor_r2r( mm4, mm4 );
                pxor_r2r( mm5, mm5 );
                pxor_r2r( mm6, mm6 );

                punpcklbw_r2r( mm1, mm3 );
                punpcklbw_r2r( mm2, mm4 );
                punpckhbw_r2r( mm1, mm5 );
                punpckhbw_r2r( mm2, mm6 );

                pmulhw_r2r( mm3, mm4 );
                pmulhw_r2r( mm5, mm6 );

                packsswb_r2r(mm4, mm6);
                pcmpgtb_m2r( bT, mm6 );
                psadbw_m2r( b0, mm6 );
                paddd_r2r( mm6, mm7 );

                p_c += 8;
                p_p += 8;
                p_n += 8;
            }

            for( ; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
                int_fast32_t C = *p_c;
                int_fast32_t P = *p_p;
                int_fast32_t N = *p_n;

                /* Comments in Transcode's filter_ivtc.c attribute this
                   combing metric to Gunnar Thalin.

                    The idea is that if the picture is interlaced, both
                    expressions will have the same sign, and this comes
                    up positive. The value T = 100 has been chosen such
                    that a pixel difference of 10 (on average) will
                    trigger the detector.
                */
                int_fast32_t comb = (P - C) * (N - C);
                if( comb > T )
                    ++i_score_c;

                ++p_c;
                ++p_p;
                ++p_n;
            }

            /* Now the other field - swap current and neighbour pictures */
            const picture_t *tmp = cur;
            cur = ngh;
            ngh = tmp;
            int tmp_pitch = wc;
            wc = wn;
            wn = tmp_pitch;
        }
    }

    movd_r2m( mm7, i_score_mmx );
    emms();

    return i_score_mmx/255 + i_score_c;
}
#endif

/* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
                             const picture_t* p_pic_bot )
{
    /*
        We use the comb metric from the IVTC filter of Transcode 1.1.5.
        This was found to work better for the particular purpose of IVTC
        than RenderX()'s comb metric.

        Note that we *must not* subsample at all in order to catch interlacing
        in telecined frames with localized motion (e.g. anime with characters
        talking, where only mouths move and everything else stays still.)
    */

    assert( p_pic_top != NULL );
    assert( p_pic_bot != NULL );

    if( p_pic_top->i_planes != p_pic_bot->i_planes )
        return -1;

#ifdef CAN_COMPILE_MMXEXT
    if (vlc_CPU_MMXEXT())
        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
#endif

    int32_t i_score = 0;

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
        /* Sanity check */
        if( p_pic_top->p[i_plane].i_visible_lines !=
            p_pic_bot->p[i_plane].i_visible_lines )
            return -1;

        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                             p_pic_bot->p[i_plane].i_visible_pitch );

        /* Current line / neighbouring lines picture pointers */
        const picture_t *cur = p_pic_bot;
        const picture_t *ngh = p_pic_top;
        int wc = cur->p[i_plane].i_pitch;
        int wn = ngh->p[i_plane].i_pitch;

        /* Transcode 1.1.5 only checks every other line. Checking every line
           works better for anime, which may contain horizontal,
           one pixel thick cartoon outlines.
        */
        for( int y = 1; y < i_lasty; ++y )
        {
            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */

            for( int x = 0; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
                int_fast32_t C = *p_c;
                int_fast32_t P = *p_p;
                int_fast32_t N = *p_n;

                /* Comments in Transcode's filter_ivtc.c attribute this
                   combing metric to Gunnar Thalin.

                    The idea is that if the picture is interlaced, both
                    expressions will have the same sign, and this comes
                    up positive. The value T = 100 has been chosen such
                    that a pixel difference of 10 (on average) will
                    trigger the detector.
                */
                int_fast32_t comb = (P - C) * (N - C);
                if( comb > T )
                    ++i_score;

                ++p_c;
                ++p_p;
                ++p_n;
            }

            /* Now the other field - swap current and neighbour pictures */
            const picture_t *tmp = cur;
            cur = ngh;
            ngh = tmp;
            int tmp_pitch = wc;
            wc = wn;
            wn = tmp_pitch;
        }
    }

    return i_score;
}
Exemple #30
0
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                    int i_pitch_prev, int i_pitch_curr,
                                    int* pi_top, int* pi_bot )
{
    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

    static alignas (8) const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
    movq_m2r( bT,  mm5 );

    pxor_r2r( mm3, mm3 ); /* score (top field) */
    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
    for( int y = 0; y < 8; y+=2 )
    {
        /* top field */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm3 ); /* add to top field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;

        /* bottom field - handling identical to top field, except... */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }
    movq_r2r(  mm3, mm7 ); /* score (total) */
    paddd_r2r( mm4, mm7 );
    movd_r2m( mm3, i_top_motion );
    movd_r2m( mm4, i_bot_motion );
    movd_r2m( mm7, i_motion );

    /* The loop counts actual score * 255. */
    i_top_motion /= 255;
    i_bot_motion /= 255;
    i_motion     /= 255;

    emms();

    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );
    return (i_motion >= 8);
}