Example #1
void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
                                     JSAMPARRAY input_buf,
                                     JSAMPIMAGE output_buf,
                                     JDIMENSION output_row, int num_rows)
{
  JSAMPROW inptr, outptr;
  int pitch;
  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0}, rgb3 = {0}, rgbg0,
    rgbg1, rgbg2, rgbg3, y;
#if RGB_PIXELSIZE == 4
  __vector unsigned char rgb4;
#endif
  __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
  __vector unsigned short y01, y23;
  __vector int y0, y1, y2, y3;

  /* Constants */
  __vector short pw_f0299_f0337 = { __4X2(F_0_299, F_0_337) },
    pw_f0114_f0250 = { __4X2(F_0_114, F_0_250) };
  __vector int pd_onehalf = { __4X(ONE_HALF) };
  __vector unsigned char zero = { __16X(0) },
    shift_pack_index =
      { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};

  while (--num_rows >= 0) {
    inptr = *input_buf++;
    outptr = output_buf[0][output_row];
    output_row++;

    for (pitch = img_width * RGB_PIXELSIZE; pitch > 0;
         pitch -= RGB_PIXELSIZE * 16, inptr += RGB_PIXELSIZE * 16,
         outptr += 16) {

#if RGB_PIXELSIZE == 3
      /* Load 16 pixels == 48 bytes */
      if ((size_t)inptr & 15) {
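        /* vec_ld() ignores the low four bits of its effective address, so a
         * misaligned inptr needs overlapping aligned loads stitched together
         * with vec_lvsl()/vec_perm().  Near the end of the row, loading from
         * (inptr + pitch - 1) picks up the last aligned 16 bytes that are
         * still inside the buffer instead of reading past its end.
         */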
        __vector unsigned char unaligned_shift_index;
        rgb0 = vec_ld(0, inptr);
        if (pitch > 16)
          rgb1 = vec_ld(16, inptr);
        else
          rgb1 = vec_ld(-1, inptr + pitch);
        if (pitch > 32)
          rgb2 = vec_ld(32, inptr);
        else
          rgb2 = vec_ld(-1, inptr + pitch);
        if (pitch > 48)
          rgb3 = vec_ld(48, inptr);
        else
          rgb3 = vec_ld(-1, inptr + pitch);
        unaligned_shift_index = vec_lvsl(0, inptr);
        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
      } else {
        rgb0 = vec_ld(0, inptr);
        if (pitch > 16)
          rgb1 = vec_ld(16, inptr);
        if (pitch > 32)
          rgb2 = vec_ld(32, inptr);
      }

      /* rgb0 = R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5
       * rgb1 = G5 B5 R6 G6 B6 R7 G7 B7 R8 G8 B8 R9 G9 B9 Ra Ga
       * rgb2 = Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf Gf Bf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX0);
      rgbg1 = vec_perm(rgb0, rgb1, (__vector unsigned char)RGBG_INDEX1);
      rgbg2 = vec_perm(rgb1, rgb2, (__vector unsigned char)RGBG_INDEX2);
      rgbg3 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX3);
#else
      /* Load 16 pixels == 64 bytes */
      if ((size_t)inptr & 15) {
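        /* Same misaligned-load strategy as the RGB_PIXELSIZE == 3 path, but
         * with five aligned loads, since 16 misaligned 4-byte pixels can
         * straddle five 16-byte blocks.
         */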
        __vector unsigned char unaligned_shift_index;
        rgb0 = vec_ld(0, inptr);
        if (pitch > 16)
          rgb1 = vec_ld(16, inptr);
        else
          rgb1 = vec_ld(-1, inptr + pitch);
        if (pitch > 32)
          rgb2 = vec_ld(32, inptr);
        else
          rgb2 = vec_ld(-1, inptr + pitch);
        if (pitch > 48)
          rgb3 = vec_ld(48, inptr);
        else
          rgb3 = vec_ld(-1, inptr + pitch);
        if (pitch > 64)
          rgb4 = vec_ld(64, inptr);
        else
          rgb4 = vec_ld(-1, inptr + pitch);
        unaligned_shift_index = vec_lvsl(0, inptr);
        rgb0 = vec_perm(rgb0, rgb1, unaligned_shift_index);
        rgb1 = vec_perm(rgb1, rgb2, unaligned_shift_index);
        rgb2 = vec_perm(rgb2, rgb3, unaligned_shift_index);
        rgb3 = vec_perm(rgb3, rgb4, unaligned_shift_index);
      } else {
        rgb0 = vec_ld(0, inptr);
        if (pitch > 16)
          rgb1 = vec_ld(16, inptr);
        if (pitch > 32)
          rgb2 = vec_ld(32, inptr);
        if (pitch > 48)
          rgb3 = vec_ld(48, inptr);
      }

      /* rgb0 = R0 G0 B0 X0 R1 G1 B1 X1 R2 G2 B2 X2 R3 G3 B3 X3
       * rgb1 = R4 G4 B4 X4 R5 G5 B5 X5 R6 G6 B6 X6 R7 G7 B7 X7
       * rgb2 = R8 G8 B8 X8 R9 G9 B9 X9 Ra Ga Ba Xa Rb Gb Bb Xb
       * rgb3 = Rc Gc Bc Xc Rd Gd Bd Xd Re Ge Be Xe Rf Gf Bf Xf
       *
       * rgbg0 = R0 G0 R1 G1 R2 G2 R3 G3 B0 G0 B1 G1 B2 G2 B3 G3
       * rgbg1 = R4 G4 R5 G5 R6 G6 R7 G7 B4 G4 B5 G5 B6 G6 B7 G7
       * rgbg2 = R8 G8 R9 G9 Ra Ga Rb Gb B8 G8 B9 G9 Ba Ga Bb Gb
       * rgbg3 = Rc Gc Rd Gd Re Ge Rf Gf Bc Gc Bd Gd Be Ge Bf Gf
       */
      rgbg0 = vec_perm(rgb0, rgb0, (__vector unsigned char)RGBG_INDEX);
      rgbg1 = vec_perm(rgb1, rgb1, (__vector unsigned char)RGBG_INDEX);
      rgbg2 = vec_perm(rgb2, rgb2, (__vector unsigned char)RGBG_INDEX);
      rgbg3 = vec_perm(rgb3, rgb3, (__vector unsigned char)RGBG_INDEX);
#endif

      /* rg0 = R0 G0 R1 G1 R2 G2 R3 G3
       * bg0 = B0 G0 B1 G1 B2 G2 B3 G3
       * ...
       *
       * NOTE: We have to use vec_merge*() here because vec_unpack*() doesn't
       * support unsigned vectors.
       */
      rg0 = (__vector signed short)vec_mergeh(zero, rgbg0);
      bg0 = (__vector signed short)vec_mergel(zero, rgbg0);
      rg1 = (__vector signed short)vec_mergeh(zero, rgbg1);
      bg1 = (__vector signed short)vec_mergel(zero, rgbg1);
      rg2 = (__vector signed short)vec_mergeh(zero, rgbg2);
      bg2 = (__vector signed short)vec_mergel(zero, rgbg2);
      rg3 = (__vector signed short)vec_mergeh(zero, rgbg3);
      bg3 = (__vector signed short)vec_mergel(zero, rgbg3);

      /* (Original)
       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
       *
       * (This implementation)
       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
       */

      /* Calculate Y values */

      y0 = vec_msums(rg0, pw_f0299_f0337, pd_onehalf);
      y1 = vec_msums(rg1, pw_f0299_f0337, pd_onehalf);
      y2 = vec_msums(rg2, pw_f0299_f0337, pd_onehalf);
      y3 = vec_msums(rg3, pw_f0299_f0337, pd_onehalf);
      y0 = vec_msums(bg0, pw_f0114_f0250, y0);
      y1 = vec_msums(bg1, pw_f0114_f0250, y1);
      y2 = vec_msums(bg2, pw_f0114_f0250, y2);
      y3 = vec_msums(bg3, pw_f0114_f0250, y3);
      /* Clever way to avoid 4 shifts + 2 packs.  This packs the high word from
       * each dword into a new 16-bit vector, which is the equivalent of
       * descaling the 32-bit results (right-shifting by 16 bits) and then
       * packing them.
       */
      y01 = vec_perm((__vector unsigned short)y0, (__vector unsigned short)y1,
                     shift_pack_index);
      y23 = vec_perm((__vector unsigned short)y2, (__vector unsigned short)y3,
                     shift_pack_index);
      y = vec_pack(y01, y23);
      vec_st(y, 0, outptr);
    }
  }
}
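
The two arithmetic tricks above are easy to check in scalar code: 0.58700 is
split into 0.33700 + 0.25000 so that every multiplier fed to vec_msums() fits
in a signed 16-bit element, and the shift_pack_index permute descales by
taking the high halfword of each 32-bit sum instead of shifting and packing.
The following standalone sketch is not part of libjpeg-turbo; it assumes the
usual 16-bit fixed-point scaling (SCALEBITS == 16, FIX(x) == round(x * 65536))
behind the F_0_299, F_0_337, F_0_114, F_0_250 and ONE_HALF constants.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SCALEBITS  16
#define ONE_HALF   ((int32_t)1 << (SCALEBITS - 1))
#define FIX(x)     ((int32_t)((x) * (1L << SCALEBITS) + 0.5))

static uint8_t rgb_to_gray(int r, int g, int b)
{
  /* Reference formula: Y = 0.29900 R + 0.58700 G + 0.11400 B */
  int32_t y_ref = FIX(0.29900) * r + FIX(0.58700) * g + FIX(0.11400) * b;

  /* Split form used above: 0.58700 G == 0.33700 G + 0.25000 G */
  int32_t y_split = FIX(0.29900) * r + FIX(0.33700) * g +
                    FIX(0.11400) * b + FIX(0.25000) * g;
  assert(y_ref == y_split);

  /* Descaling: adding ONE_HALF and keeping only the high 16 bits of the
   * 32-bit sum rounds and right-shifts by SCALEBITS in one step, which is
   * exactly what the shift_pack_index vec_perm() extracts. */
  int32_t y = y_split + ONE_HALF;
  uint16_t high_halfword = (uint16_t)((uint32_t)y >> 16);
  assert(high_halfword == (uint16_t)(y >> SCALEBITS));
  return (uint8_t)high_halfword;
}

int main(void)
{
  printf("Y(255, 128, 0) = %u\n", rgb_to_gray(255, 128, 0));
  return 0;
}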
Example #2
void
jsimd_fdct_islow_altivec (DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
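
DO_FDCT_PASS1(), DO_FDCT_PASS2() and TRANSPOSE() are macros defined alongside
this function in the libjpeg-turbo AltiVec sources, but the overall shape is
the classic separable 2-D DCT: run a 1-D transform over the rows, transpose,
and run it again, with the SIMD twist that after TRANSPOSE() each
__vector short holds one sample position across all eight rows, so a single
pass performs eight 1-D transforms in parallel.  Below is a minimal scalar
sketch of that pass/transpose structure (not libjpeg-turbo code), using a
naive floating-point 1-D DCT-II as a stand-in for the fixed-point islow
kernel.

#include <math.h>
#include <stdio.h>

#define DCTSIZE 8
#define PI 3.14159265358979323846

/* Naive, unscaled floating-point 1-D DCT-II (stand-in for the real kernel). */
static void fdct_1d(const double in[DCTSIZE], double out[DCTSIZE])
{
  for (int k = 0; k < DCTSIZE; k++) {
    double s = 0.0;
    for (int n = 0; n < DCTSIZE; n++)
      s += in[n] * cos(PI * (2 * n + 1) * k / (2.0 * DCTSIZE));
    out[k] = s;
  }
}

static void transpose8x8(double m[DCTSIZE][DCTSIZE])
{
  for (int i = 0; i < DCTSIZE; i++)
    for (int j = i + 1; j < DCTSIZE; j++) {
      double t = m[i][j];
      m[i][j] = m[j][i];
      m[j][i] = t;
    }
}

/* Separable 2-D DCT: transform the rows, transpose, transform the rows of the
 * transposed block (i.e. the original columns), then transpose back. */
static void fdct_2d(double block[DCTSIZE][DCTSIZE])
{
  double tmp[DCTSIZE];

  for (int i = 0; i < DCTSIZE; i++) {           /* Pass 1: rows */
    fdct_1d(block[i], tmp);
    for (int j = 0; j < DCTSIZE; j++) block[i][j] = tmp[j];
  }
  transpose8x8(block);
  for (int i = 0; i < DCTSIZE; i++) {           /* Pass 2: columns */
    fdct_1d(block[i], tmp);
    for (int j = 0; j < DCTSIZE; j++) block[i][j] = tmp[j];
  }
  transpose8x8(block);
}

int main(void)
{
  double block[DCTSIZE][DCTSIZE];

  for (int i = 0; i < DCTSIZE; i++)
    for (int j = 0; j < DCTSIZE; j++)
      block[i][j] = 10.0 * i + j;               /* arbitrary test block */

  fdct_2d(block);
  printf("DC term: %.2f\n", block[0][0]);
  return 0;
}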
Example #3
void
jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
    short *dct_table = (short *)dct_table_;
    int *outptr;

    __vector short row0, row1, row2, row3, row4, row5, row6, row7,
             col0, col1, col2, col3, col4, col5, col6, col7,
             quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
             tmp0, tmp1, tmp2, tmp3, z3, z4,
             z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
             row71l, row71h, row26l, row26h, row53l, row53h,
             out0, out1, out2, out3, out4, out5, out6, out7;
    __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
             tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
             z3l, z3h, z4l, z4h,
             out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
             out5l, out5h, out6l, out6h, out7l, out7h;
    __vector signed char outb;

    /* Constants */
    __vector short pw_zero = { __8X(0) },
                   pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
                   pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
                   pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
                   pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
                   pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
                   pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
                   pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
                   pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
    __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
    __vector int pd_zero = { __4X(0) },
                 pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
                 pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
    __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
                          descale_p2 = { __4X(DESCALE_P2) },
                          const_bits = { __4X(CONST_BITS) };
    __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

    /* Pass 1: process columns */

    col0 = vec_ld(0, coef_block);
    col1 = vec_ld(16, coef_block);
    col2 = vec_ld(32, coef_block);
    col3 = vec_ld(48, coef_block);
    col4 = vec_ld(64, coef_block);
    col5 = vec_ld(80, coef_block);
    col6 = vec_ld(96, coef_block);
    col7 = vec_ld(112, coef_block);

    tmp1 = vec_or(col1, col2);
    tmp2 = vec_or(col3, col4);
    tmp1 = vec_or(tmp1, tmp2);
    tmp3 = vec_or(col5, col6);
    tmp3 = vec_or(tmp3, col7);
    tmp1 = vec_or(tmp1, tmp3);

    quant0 = vec_ld(0, dct_table);
    col0 = vec_mladd(col0, quant0, pw_zero);

    if (vec_all_eq(tmp1, pw_zero)) {
        /* AC terms all zero */
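        /* With every AC coefficient zero, each column's 1-D IDCT is just a
         * constant equal to its dequantized DC term, so pass 1 reduces to
         * scaling col0 and splatting each of its lanes into a row vector.
         */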

        col0 = vec_sl(col0, pass1_bits);

        row0 = vec_splat(col0, 0);
        row1 = vec_splat(col0, 1);
        row2 = vec_splat(col0, 2);
        row3 = vec_splat(col0, 3);
        row4 = vec_splat(col0, 4);
        row5 = vec_splat(col0, 5);
        row6 = vec_splat(col0, 6);
        row7 = vec_splat(col0, 7);

    } else {

        quant1 = vec_ld(16, dct_table);
        quant2 = vec_ld(32, dct_table);
        quant3 = vec_ld(48, dct_table);
        quant4 = vec_ld(64, dct_table);
        quant5 = vec_ld(80, dct_table);
        quant6 = vec_ld(96, dct_table);
        quant7 = vec_ld(112, dct_table);

        col1 = vec_mladd(col1, quant1, pw_zero);
        col2 = vec_mladd(col2, quant2, pw_zero);
        col3 = vec_mladd(col3, quant3, pw_zero);
        col4 = vec_mladd(col4, quant4, pw_zero);
        col5 = vec_mladd(col5, quant5, pw_zero);
        col6 = vec_mladd(col6, quant6, pw_zero);
        col7 = vec_mladd(col7, quant7, pw_zero);

        DO_IDCT(col, 1);

        TRANSPOSE(out, row);
    }

    /* Pass 2: process rows */

    DO_IDCT(row, 2);

    TRANSPOSE(out, col);

    outb = vec_packs(col0, col0);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[0] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col1, col1);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[1] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col2, col2);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[2] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col3, col3);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[3] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col4, col4);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[4] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col5, col5);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[5] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col6, col6);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[6] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col7, col7);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[7] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);
}
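
The vec_all_eq() branch above is the standard DC-only shortcut: when every AC
coefficient in the block is zero, the 1-D IDCT of each column collapses to a
constant equal to its dequantized, scaled DC term, so the entire first pass is
replaced by splatting col0's lanes into row0..row7.  A small scalar sketch
(not libjpeg-turbo code, using an unscaled floating-point inverse DCT) showing
why those outputs are constant:

#include <math.h>
#include <stdio.h>

#define DCTSIZE 8
#define PI 3.14159265358979323846

/* Naive, unscaled 1-D inverse DCT (DCT-III). */
static void idct_1d(const double in[DCTSIZE], double out[DCTSIZE])
{
  for (int n = 0; n < DCTSIZE; n++) {
    double s = in[0] / 2.0;                     /* DC contribution */
    for (int k = 1; k < DCTSIZE; k++)           /* AC contributions */
      s += in[k] * cos(PI * (2 * n + 1) * k / (2.0 * DCTSIZE));
    out[n] = s;
  }
}

int main(void)
{
  /* One column of dequantized coefficients: nonzero DC, all AC zero. */
  double coef[DCTSIZE] = { 96.0, 0, 0, 0, 0, 0, 0, 0 };
  double out[DCTSIZE];

  idct_1d(coef, out);

  /* Every sample equals DC / 2, i.e. the column transform is a constant.
   * That is why the fast path above just scales col0 and vec_splat()s each
   * lane into a row vector instead of running DO_IDCT() for pass 1. */
  for (int n = 0; n < DCTSIZE; n++)
    printf("out[%d] = %.2f\n", n, out[n]);
  return 0;
}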