Example #1
0
/*
 * Horizontal 2:1 downsample (AltiVec).
 *
 * For each of the first v_samp_factor rows, every output sample is formed
 * from one even/odd pair of neighbouring input samples:
 *   out = (even + odd + bias) >> 1
 * where the bias comes from pw_bias.  Each loop iteration consumes up to 32
 * input bytes and always stores one full 16-byte output vector.
 *
 * expand_right_edge() is defined outside this function; it is called once up
 * front with the padded input width (output_cols * 2).
 *
 * NOTE(review): vec_ld/vec_st are used with no alignment fix-up, so the row
 * pointers are presumably 16-byte aligned -- confirm.
 */
void
jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
                               JDIMENSION v_samp_factor,
                               JDIMENSION width_blocks,
                               JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  int row, cols_left;
  JDIMENSION output_cols = width_blocks * DCTSIZE;
  JSAMPROW src, dst;

  __vector unsigned char pixels, packed;
  __vector unsigned short even, odd, sum_lo, sum_hi;

  /* Constants */
  __vector unsigned short pw_bias = { __4X2(0, 1) },
    pw_one = { __8X(1) };
  __vector unsigned char even_odd_index =
    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
    /* Not referenced by name below; presumably consumed by the VEC_UNPACK*
     * macros, so it must stay in scope.
     */
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (row = 0; row < v_samp_factor; row++) {
    dst = output_data[row];
    src = input_data[row];
    cols_left = output_cols;

    while (cols_left > 0) {
      /* First 16 input bytes -> low 8 output samples.  The permute gathers
       * the even-indexed bytes into the first half of the vector and the
       * odd-indexed bytes into the second half.
       */
      pixels = vec_ld(0, src);
      pixels = vec_perm(pixels, pixels, even_odd_index);
      even = (__vector unsigned short)VEC_UNPACKHU(pixels);
      odd = (__vector unsigned short)VEC_UNPACKLU(pixels);
      sum_lo = vec_sr(vec_add(vec_add(even, odd), pw_bias), pw_one);

      if (cols_left <= 8) {
        /* No more than 8 output columns remain: the upper half is zeros. */
        sum_hi = vec_splat_u16(0);
      } else {
        /* Next 16 input bytes -> high 8 output samples. */
        pixels = vec_ld(16, src);
        pixels = vec_perm(pixels, pixels, even_odd_index);
        even = (__vector unsigned short)VEC_UNPACKHU(pixels);
        odd = (__vector unsigned short)VEC_UNPACKLU(pixels);
        sum_hi = vec_sr(vec_add(vec_add(even, odd), pw_bias), pw_one);
      }

      packed = vec_pack(sum_lo, sum_hi);
      vec_st(packed, 0, dst);

      cols_left -= 16;
      src += 32;
      dst += 16;
    }
  }
}
Example #2
0
/*
 * Convert eight rows of samples into centered 16-bit workspace values
 * (AltiVec).
 *
 * Rows 0..7 are fetched with the LOAD_ROW macro (defined outside this
 * function; presumably it reads sample_data[n] at start_col into inN, using
 * elemptr as scratch -- confirm).  Each row is then widened to shorts, has
 * CENTERJSAMPLE subtracted, and is stored to `workspace` at byte offset
 * 16 * row.
 *
 * NOTE(review): vec_st is used with no alignment fix-up, so `workspace` is
 * presumably 16-byte aligned -- confirm.
 */
void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                        DCTELEM * workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short centered;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  /* Not referenced by name below; presumably consumed by VEC_UNPACKHU. */
  __vector unsigned char pb_zero = { __16X(0) };

  /* All input rows are loaded before anything is written to workspace. */
  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

  /* Row 0 */
  centered = (__vector short)VEC_UNPACKHU(in0);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 0, workspace);

  /* Row 1 */
  centered = (__vector short)VEC_UNPACKHU(in1);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 16, workspace);

  /* Row 2 */
  centered = (__vector short)VEC_UNPACKHU(in2);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 32, workspace);

  /* Row 3 */
  centered = (__vector short)VEC_UNPACKHU(in3);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 48, workspace);

  /* Row 4 */
  centered = (__vector short)VEC_UNPACKHU(in4);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 64, workspace);

  /* Row 5 */
  centered = (__vector short)VEC_UNPACKHU(in5);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 80, workspace);

  /* Row 6 */
  centered = (__vector short)VEC_UNPACKHU(in6);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 96, workspace);

  /* Row 7 */
  centered = (__vector short)VEC_UNPACKHU(in7);
  centered = vec_sub(centered, pw_centerjsamp);
  vec_st(centered, 112, workspace);
}
/*
 * Forward DCT ("islow" variant, AltiVec), performed in place on the block
 * at `data`.
 *
 * Eight vectors of shorts are read from byte offsets 0, 16, ..., 112 of
 * `data`, run through two passes (rows, then columns) and written back to
 * the same offsets.
 *
 * TRANSPOSE, DO_FDCT_PASS1 and DO_FDCT_PASS2 are macros defined outside this
 * function.  They presumably read tmp0..tmp7 and the constants declared
 * below and leave their results in out0..out7 -- confirm against their
 * definitions before renaming or removing any local.
 *
 * NOTE(review): vec_ld/vec_st are used with no alignment fix-up, so `data`
 * is presumably required to be 16-byte aligned -- confirm.
 */
void
jsimd_fdct_islow_altivec (DCTELEM *data)
{
  /* Many of these locals are never referenced by name in this function;
   * presumably they are scratch/outputs for the macros invoked below.
   */
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants */
  /* Each pw_* constant below is built from a pair of fixed-point factors via
   * __4X2 (presumably the pair repeated four times to fill the vector).
   */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  /* Rounding terms (half of the divisor) and shift counts for the two
   * descale steps.
   */
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  /* Presumably transposes row0..row7 into col0..col7. */
  TRANSPOSE(row, col);

  /* Sums and differences of the mirrored pairs (0/7, 1/6, 2/5, 3/4). */
  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  /* Presumably transposes the pass-1 results out0..out7 into row0..row7. */
  TRANSPOSE(out, row);

  /* Same sum/difference stage as pass 1, now on the transposed data. */
  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  /* Write the final results back over the input block. */
  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
Example #4
0
/*
 * Quantize one block of coefficients (AltiVec).
 *
 * Reads eight vectors of shorts from `workspace` and, per element:
 *   1. takes the absolute value, remembering the sign mask;
 *   2. adds a correction term loaded from `divisors`;
 *   3. passes the value through MULTIPLY with a reciprocal, then through
 *      MULTIPLY with a scale factor (both loaded from `divisors`);
 *   4. re-applies the original sign.
 * The eight result vectors are stored to `coef_block`.
 *
 * Byte offsets into `divisors` (eight 16-byte vectors each):
 *   reciprocals at 0, corrections at DCTSIZE2 * 2, scales at DCTSIZE2 * 4.
 *
 * MULTIPLY is a macro defined outside this function; its exact arithmetic is
 * not visible here.
 *
 * NOTE(review): vec_ld/vec_st are used with no alignment fix-up, so all
 * three pointers are presumably 16-byte aligned -- confirm.
 */
void
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM * divisors,
                        DCTELEM * workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  /* Not referenced by name below; presumably scratch for MULTIPLY. */
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
  /* Byte-selection pattern that differs by endianness.  It is not referenced
   * by name below; presumably MULTIPLY uses it to pick two bytes out of each
   * 32-bit product -- confirm.
   */
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
#else
  __vector unsigned char shift_pack_index =
    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
#endif

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value */
  /* rowNs is an arithmetic right shift by WORD_BIT - 1; assuming that equals
   * the element width minus one, it is all ones for negative elements and
   * zero otherwise.  (x ^ s) - s then yields |x|.
   */
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  /* Correction terms: second table in `divisors`. */
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  /* Reciprocals: first table in `divisors`. */
  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  /* Scale factors: third table in `divisors`. */
  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  /* Re-apply the sign saved in rowNs: (x ^ s) - s negates exactly the
   * elements whose input was negative.
   */
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}
/*
 * Inverse DCT ("islow" variant, AltiVec) with dequantization.
 *
 * Loads eight vectors of coefficients from `coef_block`, multiplies them by
 * the matching vectors of `dct_table_` (treated as an array of shorts), runs
 * two IDCT passes (columns, then rows), and writes eight output rows to
 * output_buf[0..7] starting at column `output_col`.
 *
 * DO_IDCT and TRANSPOSE are macros defined outside this function.  They
 * presumably use the many locals and constants declared below that are never
 * referenced by name here -- confirm against their definitions before
 * renaming or removing any local.
 *
 * NOTE(review): vec_ld is used with no alignment fix-up, so `coef_block` and
 * `dct_table_` are presumably 16-byte aligned -- confirm.
 */
void
jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
    short *dct_table = (short *)dct_table_;
    int *outptr;

    __vector short row0, row1, row2, row3, row4, row5, row6, row7,
             col0, col1, col2, col3, col4, col5, col6, col7,
             quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
             tmp0, tmp1, tmp2, tmp3, z3, z4,
             z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
             row71l, row71h, row26l, row26h, row53l, row53h,
             out0, out1, out2, out3, out4, out5, out6, out7;
    __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
             tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
             z3l, z3h, z4l, z4h,
             out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
             out5l, out5h, out6l, out6h, out7l, out7h;
    __vector signed char outb;

    /* Constants */
    __vector short pw_zero = { __8X(0) },
                   pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
                   pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
                   pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
                   pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
                   pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
                   pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
                   pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
                   pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
    __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
    __vector int pd_zero = { __4X(0) },
                 pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
                 pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
    __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
                          descale_p2 = { __4X(DESCALE_P2) },
                          const_bits = { __4X(CONST_BITS) };
    __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

    /* Pass 1: process columns */

    col0 = vec_ld(0, coef_block);
    col1 = vec_ld(16, coef_block);
    col2 = vec_ld(32, coef_block);
    col3 = vec_ld(48, coef_block);
    col4 = vec_ld(64, coef_block);
    col5 = vec_ld(80, coef_block);
    col6 = vec_ld(96, coef_block);
    col7 = vec_ld(112, coef_block);

    /* OR col1..col7 together: tmp1 is all-zero only if every coefficient
     * outside the first vector is zero.
     */
    tmp1 = vec_or(col1, col2);
    tmp2 = vec_or(col3, col4);
    tmp1 = vec_or(tmp1, tmp2);
    tmp3 = vec_or(col5, col6);
    tmp3 = vec_or(tmp3, col7);
    tmp1 = vec_or(tmp1, tmp3);

    /* The first vector is dequantized on both paths (col0 * quant0 + 0). */
    quant0 = vec_ld(0, dct_table);
    col0 = vec_mladd(col0, quant0, pw_zero);

    if (vec_all_eq(tmp1, pw_zero)) {
        /* AC terms all zero */

        /* Shortcut for pass 1: scale col0 up by PASS1_BITS and broadcast
         * element N of it across all of rowN.
         */
        col0 = vec_sl(col0, pass1_bits);

        row0 = vec_splat(col0, 0);
        row1 = vec_splat(col0, 1);
        row2 = vec_splat(col0, 2);
        row3 = vec_splat(col0, 3);
        row4 = vec_splat(col0, 4);
        row5 = vec_splat(col0, 5);
        row6 = vec_splat(col0, 6);
        row7 = vec_splat(col0, 7);

    } else {

        /* General path: dequantize the remaining seven vectors, then run
         * the full first pass.
         */
        quant1 = vec_ld(16, dct_table);
        quant2 = vec_ld(32, dct_table);
        quant3 = vec_ld(48, dct_table);
        quant4 = vec_ld(64, dct_table);
        quant5 = vec_ld(80, dct_table);
        quant6 = vec_ld(96, dct_table);
        quant7 = vec_ld(112, dct_table);

        col1 = vec_mladd(col1, quant1, pw_zero);
        col2 = vec_mladd(col2, quant2, pw_zero);
        col3 = vec_mladd(col3, quant3, pw_zero);
        col4 = vec_mladd(col4, quant4, pw_zero);
        col5 = vec_mladd(col5, quant5, pw_zero);
        col6 = vec_mladd(col6, quant6, pw_zero);
        col7 = vec_mladd(col7, quant7, pw_zero);

        DO_IDCT(col, 1);

        /* Presumably transposes out0..out7 into row0..row7. */
        TRANSPOSE(out, row);
    }

    /* Pass 2: process rows */

    DO_IDCT(row, 2);

    /* Presumably transposes out0..out7 into col0..col7. */
    TRANSPOSE(out, col);

    /* Output stage, repeated for each of the eight rows:
     *  - vec_packs narrows the shorts to signed chars; packing colN with
     *    itself places the same eight results in both halves of outb;
     *  - CENTERJSAMPLE is added to every byte;
     *  - two vec_ste calls at byte offsets 0 and 4 write two int-sized
     *    elements (8 bytes, assuming a 4-byte int) at
     *    output_buf[N] + output_col.
     * NOTE(review): vec_ste stores one element chosen by the target address,
     * which is presumably why outb holds the data twice -- confirm.
     */
    outb = vec_packs(col0, col0);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[0] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col1, col1);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[1] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col2, col2);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[2] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col3, col3);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[3] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col4, col4);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[4] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col5, col5);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[5] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col6, col6);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[6] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col7, col7);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[7] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);
}
/*
 * Forward DCT ("ifast" variant, AltiVec), performed in place on the block
 * at `data`.
 *
 * Eight vectors of shorts are read from byte offsets 0, 16, ..., 112 of
 * `data`, run through two passes (rows, then columns) that both use the same
 * DO_FDCT macro, and written back to the same offsets.
 *
 * TRANSPOSE and DO_FDCT are macros defined outside this function.  They
 * presumably read tmp0..tmp7 and the constants declared below and leave
 * their results in out0..out7 -- confirm against their definitions before
 * renaming or removing any local.
 *
 * NOTE(review): vec_ld/vec_st are used with no alignment fix-up, so `data`
 * is presumably required to be 16-byte aligned -- confirm.
 */
void
jsimd_fdct_ifast_altivec (DCTELEM *data)
{
  /* tmp10..tmp13 and z1..z13 are never referenced by name in this function;
   * presumably they are scratch for DO_FDCT.
   */
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z1, z2, z3, z4, z5, z11, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  /* Fixed-point multipliers, each pre-shifted left by CONST_SHIFT and
   * replicated across the vector.
   */
  __vector short pw_zero = { __8X(0) },
    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  /* Presumably transposes row0..row7 into col0..col7. */
  TRANSPOSE(row, col);

  /* Sums and differences of the mirrored pairs (0/7, 1/6, 2/5, 3/4). */
  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT();

  /* Pass 2: process columns */

  /* Presumably transposes the pass-1 results out0..out7 into row0..row7. */
  TRANSPOSE(out, row);

  /* Same sum/difference stage as pass 1, now on the transposed data. */
  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT();

  /* Write the final results back over the input block. */
  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}