static void scale_uint8_x_4_x_bilinear_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start) { int i; uint8_t * src, * dst, *src_start; int32_t * factors; // mmx_t tmp_mm; /* * mm0: Input1 * mm1: Factor mask * mm2: * mm3: Output * mm4: * mm5: Input2 * mm6: 0 * mm7: Factor * */ // fprintf(stderr, "scale_uint8_x_4_x_bilinear_mmx\n"); src_start = ctx->src + scanline * ctx->src_stride; pxor_r2r(mm6, mm6); movq_m2r(factor_mask, mm1); dst = dest_start; for(i = 0; i < ctx->dst_size; i++) { src = src_start + 4*ctx->table_h.pixels[i].index; factors = ctx->table_h.pixels[i].factor_i; /* Load pixels */ movd_m2r(*(src), mm0); punpcklbw_r2r(mm6, mm0); psllw_i2r(6, mm0); /* 14 bit */ /* Load pixels */ movd_m2r(*(src+4), mm5); punpcklbw_r2r(mm6, mm5); psllw_i2r(6, mm5); /* 14 bit */ /* Load factors */ LOAD_FACTOR_1_4_NOCLIP; /* 14 bit */ /* Subtract */ psubsw_r2r(mm5, mm0); /* s1(mm0) - s2(mm5) -> mm0 (14 bit) */ pmulhw_r2r(mm7, mm0); /* factor * (s2 - s1) -> mm0 (12 bit) */ psllw_i2r(2, mm0); /* (14 bit) */ paddsw_r2r(mm5, mm0);/* (15 bit) */ psraw_i2r(6, mm0);/* (8 bit) */ packuswb_r2r(mm6, mm0); movd_r2m(mm0, *dst); dst+=4; } ctx->need_emms = 1; }
static void _evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h) { #ifdef BUILD_MMX int xx, yy; register unsigned char *yp1, *up, *vp; unsigned char *dp1; /* destination pointers */ dp1 = rgb; for (yy = 0; yy < h; yy++) { /* plane pointers */ yp1 = yuv[yy]; up = yuv[h + (yy / 2)]; vp = yuv[h + (h / 2) + (yy / 2)]; for (xx = 0; xx < (w - 7); xx += 8) { movd_m2r(*up, mm3); movd_m2r(*vp, mm2); movq_m2r(*yp1, mm0); pxor_r2r(mm7, mm7); punpcklbw_r2r(mm7, mm2); punpcklbw_r2r(mm7, mm3); movq_r2r(mm0, mm1); psrlw_i2r(8, mm0); psllw_i2r(8, mm1); psrlw_i2r(8, mm1); movq_m2r(CONST_16, mm4); psubsw_r2r(mm4, mm0); psubsw_r2r(mm4, mm1); movq_m2r(CONST_128, mm5); psubsw_r2r(mm5, mm2); psubsw_r2r(mm5, mm3); movq_m2r(CONST_YMUL, mm4); pmullw_r2r(mm4, mm0); pmullw_r2r(mm4, mm1); movq_m2r(CONST_CRVCRV, mm7); pmullw_r2r(mm3, mm7); movq_m2r(CONST_CBUCBU, mm6); pmullw_r2r(mm2, mm6); movq_m2r(CONST_CGUCGU, mm5); pmullw_r2r(mm2, mm5); movq_m2r(CONST_CGVCGV, mm4); pmullw_r2r(mm3, mm4); movq_r2r(mm0, mm2); paddsw_r2r(mm7, mm2); paddsw_r2r(mm1, mm7); psraw_i2r(RES, mm2); psraw_i2r(RES, mm7); packuswb_r2r(mm7, mm2); pxor_r2r(mm7, mm7); movq_r2r(mm2, mm3); punpckhbw_r2r(mm7, mm2); punpcklbw_r2r(mm3, mm7); por_r2r(mm7, mm2); movq_r2r(mm0, mm3); psubsw_r2r(mm5, mm3); psubsw_r2r(mm4, mm3); paddsw_m2r(CONST_32, mm3); movq_r2r(mm1, mm7); psubsw_r2r(mm5, mm7); psubsw_r2r(mm4, mm7); paddsw_m2r(CONST_32, mm7); psraw_i2r(RES, mm3); psraw_i2r(RES, mm7); packuswb_r2r(mm7, mm3); pxor_r2r(mm7, mm7); movq_r2r(mm3, mm4); punpckhbw_r2r(mm7, mm3); punpcklbw_r2r(mm4, mm7); por_r2r(mm7, mm3); movq_m2r(CONST_32, mm4); paddsw_r2r(mm6, mm0); paddsw_r2r(mm6, mm1); paddsw_r2r(mm4, mm0); paddsw_r2r(mm4, mm1); psraw_i2r(RES, mm0); psraw_i2r(RES, mm1); packuswb_r2r(mm1, mm0); pxor_r2r(mm7, mm7); movq_r2r(mm0, mm5); punpckhbw_r2r(mm7, mm0); punpcklbw_r2r(mm5, mm7); por_r2r(mm7, mm0); movq_m2r(CONST_FF, mm1); movq_r2r(mm0, mm5); movq_r2r(mm3, mm6); movq_r2r(mm2, mm7); punpckhbw_r2r(mm3, mm2); punpcklbw_r2r(mm6, mm7); punpckhbw_r2r(mm1, mm0); punpcklbw_r2r(mm1, mm5); movq_r2r(mm7, mm1); punpckhwd_r2r(mm5, mm7); punpcklwd_r2r(mm5, mm1); movq_r2r(mm2, mm4); punpckhwd_r2r(mm0, mm2); punpcklwd_r2r(mm0, mm4); movntq_r2m(mm1, *(dp1)); movntq_r2m(mm7, *(dp1 + 8)); movntq_r2m(mm4, *(dp1 + 16)); movntq_r2m(mm2, *(dp1 + 24)); yp1 += 8; up += 4; vp += 4; dp1 += 8 * 4; } /* cleanup pixles that arent a multiple of 8 pixels wide */ if (xx < w) { int y, u, v, r, g, b; for (; xx < w; xx += 2) { u = (*up++) - 128; v = (*vp++) - 128; y = RZ(YMUL) * ((*yp1++) - 16); r = LUT_CLIP((y + (_crv * v)) >> RES); g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES); b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES); *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b); dp1 += 4; y = RZ(YMUL) * ((*yp1++) - 16); r = LUT_CLIP((y + (_crv * v)) >> RES); g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES); b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES); *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b); dp1 += 4; } } }
void yuv411planar_to_rgb_mmx (const unsigned char *yuv, unsigned char *rgb, unsigned int w, unsigned int h) { unsigned int xx, yy; register const unsigned char *yp1, *up, *vp; unsigned char *dp1; /* plane pointers */ yp1 = yuv; up = yuv + (w * h); vp = up + (w * (h / 4)); /* destination pointers */ dp1 = rgb; yp1 = yuv; up = yuv + (w * h); vp = up + ((w / 2) * (h / 2)); dp1 = rgb; for (yy = 0; yy < h; yy++) { for (xx = 0; xx < w; xx += 8) { movq_m2r(*yp1, mm0); movq_r2r(mm0, mm1); psrlw_i2r(8, mm0); psllw_i2r(8, mm1); psrlw_i2r(8, mm1); pxor_r2r(mm7, mm7); movd_m2r(*up, mm3); movd_m2r(*vp, mm2); punpcklbw_r2r(mm7, mm2); punpcklbw_r2r(mm7, mm3); movq_m2r(CONST_16, mm4); psubsw_r2r(mm4, mm0); psubsw_r2r(mm4, mm1); movq_m2r(CONST_128, mm5); psubsw_r2r(mm5, mm2); psubsw_r2r(mm5, mm3); movq_m2r(CONST_YMUL, mm4); pmullw_r2r(mm4, mm0); pmullw_r2r(mm4, mm1); movq_m2r(CONST_CRVCRV, mm7); pmullw_r2r(mm3, mm7); movq_m2r(CONST_CBUCBU, mm6); pmullw_r2r(mm2, mm6); movq_m2r(CONST_CGUCGU, mm5); pmullw_r2r(mm2, mm5); movq_m2r(CONST_CGVCGV, mm4); pmullw_r2r(mm3, mm4); movq_r2r(mm0, mm2); paddsw_r2r(mm7, mm2); paddsw_r2r(mm1, mm7); psraw_i2r(RES, mm2); psraw_i2r(RES, mm7); packuswb_r2r(mm7, mm2); pxor_r2r(mm7, mm7); movq_r2r(mm2, mm3); punpckhbw_r2r(mm7, mm2); punpcklbw_r2r(mm3, mm7); por_r2r(mm7, mm2); movq_r2r(mm0, mm3); psubsw_r2r(mm5, mm3); psubsw_r2r(mm4, mm3); paddsw_m2r(CONST_32, mm3); movq_r2r(mm1, mm7); psubsw_r2r(mm5, mm7); psubsw_r2r(mm4, mm7); paddsw_m2r(CONST_32, mm7); psraw_i2r(RES, mm3); psraw_i2r(RES, mm7); packuswb_r2r(mm7, mm3); pxor_r2r(mm7, mm7); movq_r2r(mm3, mm4); punpckhbw_r2r(mm7, mm3); punpcklbw_r2r(mm4, mm7); por_r2r(mm7, mm3); movq_m2r(CONST_32, mm4); paddsw_r2r(mm6, mm0); paddsw_r2r(mm6, mm1); paddsw_r2r(mm4, mm0); paddsw_r2r(mm4, mm1); psraw_i2r(RES, mm0); psraw_i2r(RES, mm1); packuswb_r2r(mm1, mm0); pxor_r2r(mm7, mm7); movq_r2r(mm0, mm5); punpckhbw_r2r(mm7, mm0); punpcklbw_r2r(mm5, mm7); por_r2r(mm7, mm0); pxor_r2r(mm1, mm1); movq_r2r(mm0, mm5); movq_r2r(mm3, mm6); movq_r2r(mm2, mm7); punpckhbw_r2r(mm3, mm2); punpcklbw_r2r(mm6, mm7); punpckhbw_r2r(mm1, mm0); punpcklbw_r2r(mm1, mm5); movq_r2r(mm7, mm1); punpckhwd_r2r(mm5, mm7); punpcklwd_r2r(mm5, mm1); movq_r2r(mm2, mm4); punpckhwd_r2r(mm0, mm2); punpcklwd_r2r(mm0, mm4); movntq_r2m(mm1, *(dp1)); movntq_r2m(mm7, *(dp1 + 8)); movntq_r2m(mm4, *(dp1 + 16)); movntq_r2m(mm2, *(dp1 + 24)); yp1 += 8; up += 4; vp += 4; dp1 += 8 * 4; } if (yy & 0x1) { up -= w / 2; vp -= w / 2; } } emms(); }