static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { if(((int)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } if (offset) { __asm__ volatile("movq (%0), %%xmm3\n\t" "movdqa %%xmm3, %%xmm4\n\t" "psrlq $24, %%xmm3\n\t" "psllq $40, %%xmm4\n\t" "por %%xmm4, %%xmm3\n\t" :: "r"(dither) ); } else {
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } filterSize--; #define MAIN_FUNCTION \ "pxor %%xmm0, %%xmm0 \n\t" \ "punpcklbw %%xmm0, %%xmm3 \n\t" \ "movd %4, %%xmm1 \n\t" \ "punpcklwd %%xmm1, %%xmm1 \n\t" \ "punpckldq %%xmm1, %%xmm1 \n\t" \ "punpcklqdq %%xmm1, %%xmm1 \n\t" \ "psllw $3, %%xmm1 \n\t" \ "paddw %%xmm1, %%xmm3 \n\t" \ "psraw $4, %%xmm3 \n\t" \ "movdqa %%xmm3, %%xmm4 \n\t" \ "movdqa %%xmm3, %%xmm7 \n\t" \ "movl %3, %%ecx \n\t" \ "mov %0, %%"FF_REG_d" \n\t"\ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ ".p2align 4 \n\t" /* FIXME Unroll? */\ "1: \n\t"\ "movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ "movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ "movdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ "add $16, %%"FF_REG_d" \n\t"\ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ "pmulhw %%xmm0, %%xmm2 \n\t"\ "pmulhw %%xmm0, %%xmm5 \n\t"\ "paddw %%xmm2, %%xmm3 \n\t"\ "paddw %%xmm5, %%xmm4 \n\t"\ " jnz 1b \n\t"\ "psraw $3, %%xmm3 \n\t"\ "psraw $3, %%xmm4 \n\t"\ "packuswb %%xmm4, %%xmm3 \n\t"\ "movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ "add $16, %%"FF_REG_c" \n\t"\ "cmp %2, %%"FF_REG_c" \n\t"\ "movdqa %%xmm7, %%xmm3 \n\t" \ "movdqa %%xmm7, %%xmm4 \n\t" \ "mov %0, %%"FF_REG_d" \n\t"\ "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ "jb 1b \n\t" if (offset) { __asm__ volatile( "movq %5, %%xmm3 \n\t" "movdqa %%xmm3, %%xmm4 \n\t" "psrlq $24, %%xmm3 \n\t" "psllq $40, %%xmm4 \n\t" "por %%xmm4, %%xmm3 \n\t" MAIN_FUNCTION :: "g" (filter), "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), "m"(filterSize), "m"(((uint64_t *) dither)[0]) : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c ); } else {