/*
 * Horizontal 2:1 downsampling of v_samp_factor rows using AltiVec.
 *
 * Each output sample is formed from one even-indexed and one odd-indexed
 * input sample of the same row: (even + odd + bias) >> 1.  Every pass of the
 * inner loop reads up to 32 input samples and stores one 16-byte vector of
 * output samples.
 *
 * image_width and max_v_samp_factor are only forwarded to
 * expand_right_edge(), which is called once, before any row is processed,
 * with twice the output width.  The number of output samples per row is
 * width_blocks * DCTSIZE.
 */
void
jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
                               JDIMENSION v_samp_factor,
                               JDIMENSION width_blocks,
                               JSAMPARRAY input_data, JSAMPARRAY output_data)
{
  int row, cols_left;
  JDIMENSION output_cols = width_blocks * DCTSIZE;
  JSAMPROW src, dst;

  __vector unsigned char this0, next0, out;
  __vector unsigned short this0e, this0o, next0e, next0o, outl, outh;

  /* Constants */
  /* NOTE(review): __4X2(0, 1) presumably expands to an alternating 0, 1
   * pattern across the eight lanes -- confirm against the macro definition.
   */
  __vector unsigned short pw_bias = { __4X2(0, 1) },
    pw_one = { __8X(1) };
  /* Permute pattern: even-indexed bytes first, odd-indexed bytes last. */
  __vector unsigned char even_odd_index =
    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
    /* Not referenced directly below; presumably consumed by the
     * VEC_UNPACKHU/VEC_UNPACKLU macros, so it must keep this name.
     */
    pb_zero = { __16X(0) };

  expand_right_edge(input_data, max_v_samp_factor, image_width,
                    output_cols * 2);

  for (row = 0; row < v_samp_factor; row++) {
    src = input_data[row];
    dst = output_data[row];
    cols_left = output_cols;

    while (cols_left > 0) {
      /* Input samples 0..15 of this pass -> low half of the output vector */
      this0 = vec_ld(0, src);
      this0 = vec_perm(this0, this0, even_odd_index);
      this0e = (__vector unsigned short)VEC_UNPACKHU(this0);
      this0o = (__vector unsigned short)VEC_UNPACKLU(this0);
      outl = vec_sr(vec_add(vec_add(this0e, this0o), pw_bias), pw_one);

      if (cols_left <= 8) {
        /* Only 8 or fewer output columns remain: the second 16 input
         * samples are not read and the high half is zero-filled.
         */
        outh = vec_splat_u16(0);
      } else {
        /* Input samples 16..31 of this pass -> high half of the output */
        next0 = vec_ld(16, src);
        next0 = vec_perm(next0, next0, even_odd_index);
        next0e = (__vector unsigned short)VEC_UNPACKHU(next0);
        next0o = (__vector unsigned short)VEC_UNPACKLU(next0);
        outh = vec_sr(vec_add(vec_add(next0e, next0o), pw_bias), pw_one);
      }

      /* Narrow both halves back to bytes and store one full vector. */
      out = vec_pack(outl, outh);
      vec_st(out, 0, dst);

      cols_left -= 16;
      src += 32;
      dst += 16;
    }
  }
}
/*
 * Load eight rows of samples, widen them to 16-bit signed values, subtract
 * CENTERJSAMPLE from every element, and store the result to workspace as
 * eight consecutive 16-byte vectors (128 bytes in total).
 *
 * sample_data and start_col are not referenced directly in this body; they
 * are presumably consumed by the LOAD_ROW() macro (defined elsewhere), which
 * is expected to fill in<row> for the given row index -- confirm against the
 * macro definition.
 */
void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                        DCTELEM * workspace)
{
  /* Not used directly below; presumably scratch for LOAD_ROW(). */
  JSAMPROW elemptr;

  /* in<N> holds the raw bytes of row N; out<N> holds its widened result. */
  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  /* Not used directly below; presumably referenced by VEC_UNPACKHU(). */
  __vector unsigned char pb_zero = { __16X(0) };

  /* Fetch rows 0..7 (one vector per row). */
  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

  /* Widen each row to a vector of shorts.  NOTE(review): VEC_UNPACKHU
   * presumably zero-extends the high 8 bytes of its argument -- confirm.
   */
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

  /* Subtract CENTERJSAMPLE from every element of every row. */
  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

  /* Store the rows back to back; the second argument of vec_st is a byte
   * offset, so consecutive rows are 16 bytes apart.
   */
  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}