void Safe2Decrypt_RIJ128(const Ipp8u* in,
                               Ipp8u* out,
                               int Nr,
                               const Ipp8u* RoundKey,
                               const void* sbox)
{
   Ipp32u state[4];

   int round=0;

   UNREFERENCED_PARAMETER(sbox);

   // copy input to the state array
   TRANSPOSE((Ipp8u*)state, in);

   // add the round key to the state before starting the rounds.
   XorRoundKey((Ipp32u*)state, (Ipp32u*)(RoundKey+Nr*16));

   // the first Nr-1 rounds are identical; the final round below omits invMixColumns
   for(round=Nr-1;round>0;round--) {
      invShiftRows(state);
      invSubBytes((Ipp8u*)state);
      XorRoundKey(state,(Ipp32u*)(RoundKey+round*16));
      invMixColumns(state);
   }

   // last round
   invShiftRows(state);
   invSubBytes((Ipp8u*)state);
   XorRoundKey(state,(Ipp32u*)(RoundKey+0*16));

   // copy from the state to output
   TRANSPOSE(out, (Ipp8u*)state);
}
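/*
 * Note: TRANSPOSE() is not shown in this snippet. Here it copies the 16
 * input bytes into the state with rows and columns swapped (the AES
 * state is stored column-major). A minimal scalar sketch of that
 * mapping, assuming a plain 4x4 byte transpose (the actual IPP macro
 * may be implemented differently):
 */
static void transpose_4x4(Ipp8u out[16], const Ipp8u in[16])
{
   int i;
   /* the byte at row r, column c moves to row c, column r */
   for(i=0; i<16; i++)
      out[(i & 3) * 4 + (i >> 2)] = in[i];
}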
Example #2
/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
{
	uint32_t ol_flags = 0;

	if (rxq->csum)
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}
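/*
 * The TRANSPOSE(val, from, to) macro used above is defined elsewhere in
 * the PMD sources; it moves the flag bit `from` in `val` to the bit
 * position `to`. A sketch of that definition (scaling by the ratio of
 * the two masks, which assumes both are single-bit, power-of-two
 * masks):
 *
 * #define TRANSPOSE(val, from, to) \
 *         (((from) >= (to)) ? \
 *          (((val) & (from)) / ((from) / (to))) : \
 *          (((val) & (from)) * ((to) / (from))))
 *
 * With ~flags as input, a *cleared* completion bit (checksum not OK)
 * becomes a *set* PKT_RX_*_BAD offload bit.
 */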
template <typename T>
void BST<T>::remove (const T &v) throw (std::runtime_error) {
  //find the node with the value 
  TreeNode<T> *n = search (v);
  if (n == NULL) {
    throw std::runtime_error("the deleted value does not exist in the BST");
  }
  //if both left and right are NULL, just delete n
  if ((n->left==NULL) && (n->right==NULL)) {
    if (n->p == NULL) {
      delete root;
      root = NULL;
    }
    else {
      if (n == n->p->left)
        n->p->left = NULL;
      else 
        n->p->right = NULL;
      delete n;
      n = NULL;
    }
  }
  //if n->right == NULL, lift n->left
  else {
    if (n->right == NULL)
      TRANSPOSE (n, n->left);
    //else lift the minimum node on the right
    else {
      TreeNode<T> *r = MINIMUM(n->right);
      TRANSPOSE (n, r);
    }
  }
}
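/*
 * Usage sketch for remove() above (insert() is assumed to exist in
 * BST<T> but is not part of this snippet):
 *
 *   BST<int> t;
 *   t.insert(5); t.insert(3); t.insert(8);
 *   t.remove(3);   // leaf node: unlinked from its parent and deleted
 *   t.remove(42);  // not present: throws std::runtime_error
 */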
Example #4
av_cold int ff_vp56_init_context(AVCodecContext *avctx, VP56Context *s,
                                  int flip, int has_alpha)
{
    int i;

    s->avctx = avctx;
    avctx->pix_fmt = has_alpha ? AV_PIX_FMT_YUVA420P : AV_PIX_FMT_YUV420P;
    if (avctx->skip_alpha) avctx->pix_fmt = AV_PIX_FMT_YUV420P;

    ff_h264chroma_init(&s->h264chroma, 8);
    ff_hpeldsp_init(&s->hdsp, avctx->flags);
    ff_videodsp_init(&s->vdsp, 8);
    ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
    ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id);
    for (i = 0; i < 64; i++) {
#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))
        s->idct_scantable[i] = TRANSPOSE(ff_zigzag_direct[i]);
#undef TRANSPOSE
    }

    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
        s->frames[i] = av_frame_alloc();
        if (!s->frames[i]) {
            ff_vp56_free(avctx);
            return AVERROR(ENOMEM);
        }
    }
    s->edge_emu_buffer_alloc = NULL;

    s->above_blocks = NULL;
    s->macroblocks = NULL;
    s->quantizer = -1;
    s->deblock_filtering = 1;
    s->golden_frame = 0;

    s->filter = NULL;

    s->has_alpha = has_alpha;

    s->modelp = &s->model;

    if (flip) {
        s->flip = -1;
        s->frbi = 2;
        s->srbi = 0;
    } else {
        s->flip = 1;
        s->frbi = 0;
        s->srbi = 2;
    }

    return 0;
}
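/*
 * The local TRANSPOSE(x) macro above swaps the 3-bit row and column
 * fields of a scan-table index, mapping r*8 + c to c*8 + r. A
 * standalone check (hypothetical test harness, not part of FFmpeg):
 */
#include <assert.h>

#define TRANSPOSE(x) (((x) >> 3) | (((x) & 7) << 3))

int main(void)
{
    int r, c;
    /* index = row*8 + col; transposing the 8x8 grid swaps row and col */
    for (r = 0; r < 8; r++)
        for (c = 0; c < 8; c++)
            assert(TRANSPOSE(r * 8 + c) == c * 8 + r);
    return 0;
}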
Example #5
/**
 * Translate RX completion flags to offload flags.
 *
 * @param[in] rxq
 *   Pointer to RX queue structure.
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @return
 *   Offload flags (ol_flags) for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_ol_flags(const struct rxq *rxq, uint32_t flags)
{
	uint32_t ol_flags = 0;

	if (rxq->csum) {
		/* Set IP checksum flag only for IPv4/IPv6 packets. */
		if (flags &
		    (IBV_EXP_CQ_RX_IPV4_PACKET | IBV_EXP_CQ_RX_IPV6_PACKET))
			ol_flags |=
				TRANSPOSE(~flags,
					IBV_EXP_CQ_RX_IP_CSUM_OK,
					PKT_RX_IP_CKSUM_BAD);
#ifdef HAVE_EXP_CQ_RX_TCP_PACKET
		/* Set L4 checksum flag only for TCP/UDP packets. */
		if (flags &
		    (IBV_EXP_CQ_RX_TCP_PACKET | IBV_EXP_CQ_RX_UDP_PACKET))
#endif /* HAVE_EXP_CQ_RX_TCP_PACKET */
			ol_flags |=
				TRANSPOSE(~flags,
					IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK,
					PKT_RX_L4_CKSUM_BAD);
	}
	/*
	 * PKT_RX_IP_CKSUM_BAD and PKT_RX_L4_CKSUM_BAD are used in place
	 * of PKT_RX_EIP_CKSUM_BAD because the latter is not functional
	 * (its value is 0).
	 */
	if ((flags & IBV_EXP_CQ_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
		ol_flags |=
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_IP_CSUM_OK,
				  PKT_RX_IP_CKSUM_BAD) |
			TRANSPOSE(~flags,
				  IBV_EXP_CQ_RX_OUTER_TCP_UDP_CSUM_OK,
				  PKT_RX_L4_CKSUM_BAD);
	return ol_flags;
}
void
jsimd_fdct_islow_altivec (DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
    z3, z4, z34l, z34h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int z3l, z3h, z4l, z4h,
    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
    out7l, out7h;

  /* Constants */
  __vector short
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT_PASS1();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT_PASS2();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}
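/*
 * TRANSPOSE(in, out) in the AltiVec code above token-pastes its
 * arguments to name eight 8x16-bit vectors (in0..in7, out0..out7) and
 * transposes them as an 8x8 matrix. A scalar model of the semantics
 * only (the real macro presumably interleaves with vec_mergeh and
 * vec_mergel):
 */
static void transpose8x8(const short in[8][8], short out[8][8])
{
  int i, j;
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++)
      out[i][j] = in[j][i];  /* out = in transposed */
}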
Example #7
int main()
{
	srand (time(NULL));
	int i,j;
	double sigma;

	unsigned int *X = random_X (NUM_BLOCKS*NUM_CARRIERS);

	complex_table *IP_SIG = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (IP_SIG,NUM_CARRIERS,NUM_BLOCKS,"IP_SIG");

	create_IP_BLOCKS (IP_SIG,X,ALPHA,PI);

	complex_table *CARRIERS = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (CARRIERS,NUM_BLOCKS,NUM_CARRIERS,"CARRIERS");

	create_CARRIERS (CARRIERS);
	print_table (CARRIERS);

	complex_table *CARRIERST = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (CARRIERST,NUM_CARRIERS,NUM_BLOCKS,"CARRIERST");

	TRANSPOSE (CARRIERS,CARRIERST);
	//print_table (CARRIERST);
	//printf("\nrows : %d\tcolumns : %d",CARRIERST->rows,CARRIERST->columns);

	complex_table *CORRELATION = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (CORRELATION,CARRIERST->columns,CARRIERST->columns,"CORRELATION");

	CORRELATET (CARRIERST,CORRELATION);
	//print_table (CORRELATION);
	

	complex_table *MODI = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (MODI,NUM_BLOCKS,NUM_BLOCKS,"MODI");

	modulate (MODI,CARRIERS,IP_SIG);
	sigma = complex_sigma (MODI); 

	complex_table *AWGN = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (AWGN,NUM_BLOCKS*OVERSAMPLING,NUM_BLOCKS,"AWGN");
	create_noise (AWGN,Eb,sigma,OVERSAMPLING);
	
	complex_table *NOISY = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (NOISY,NUM_BLOCKS,NUM_BLOCKS*OVERSAMPLING,"NOISY");

	//	printf("\nhere\n");
	//	create_noisy (NOISY,AWGN,MODI);

	complex_addition (NOISY,AWGN,MODI);

	complex_table *INVERSE = (complex_table *) malloc (sizeof(complex_table));
	complex_table_init (INVERSE,CORRELATION->rows,CORRELATION->columns,"INVERSE");

	complex_idenity_init (INVERSE);

	complex_inverse (CORRELATION,INVERSE);
	//print_table (CORRELATION);

	/*for (i=0;i<NUM_BLOCKS;i++) {
		o = column_from_table (IP_SIG,i);
		complex_Ax (CARRIERS,SYMBOL,IP_BLOCK);
		list_to_table (MODI,SYMBOL,i);
	}*/

	//inverse (CORRELATION,INVERSE);

	// send result to table : end of loop, calculate sigma

	/*complex_list *IP_BLOCK = (complex_list *) malloc (sizeof(complex_list));
	complex_list_init (IP_BLOCK);*/

	//printf("\t%d\t%d\t:\t%.5lf\t%.5lf\t\n",mynode->col,mynode->row,mynode->real,mynode->imag);

	//complex_table *SYMBOL_MODI = create_SYMBOL_MODI (CARRIERS,IP_BLOCK);

	//complex_list *SEFDM_SYMBOL = (complex_list *) malloc (sizeof(complex_list));
	//A_complex_x (CARRIERS,SEFDM_SYMBOL,IP_BLOCK);

	//double SEFDM_SQUARED = complex

	printf("\n\n");
	printf("NUM_CARRIERS :\t\t%d \n",NUM_CARRIERS);
	printf("MODULATION_LEVEL :\t%d \n",MODULATION_LEVEL);
	printf("NUM_BLOCKS :\t\t%d \n",NUM_BLOCKS);
	printf("ALPHA :\t\t\t%d \n",ALPHA);
	printf("OVERSAMPLING :\t\t%d \n",OVERSAMPLING);
	printf("Eb :\t\t\t%d \n",Eb);
	
	//print_table (CARRIERS);
	//print_table (CORRELATION);
	
	/*
	print_table (IP_SIG);
	print_table (CARRIERS);
	print_table (CORRELATION);
	print_table (MODI);
	print_table (AWGN);
	print_table (NOISY);
	print_table (INVERSE);
	*/
	
	return 0;
}
mlib_status
__mlib_VideoIDCT8x8_S16_S16_Q1_Mismatch(
	mlib_s16 *block,
	const mlib_s16 *coeffs)
{
	mlib_d64 *dPtr = (mlib_d64 *)coeffs;
	mlib_d64 *outPtr = (mlib_d64 *)block;
	mlib_d64 dx0, dx1, dx2, dx3, dx4, dx6, dx7, dx8;
	mlib_d64 p00, p10, p20, p30, p01, p11, p21, p31, p40, p50, p60, p70;
	mlib_d64 p41, p51, p61, p71;
	mlib_d64 t0, t1;
	mlib_d64 d0, d1, d2, d3, d7, zero = vis_fzero();

	mlib_f32 COS_1_16;
	mlib_f32 COS_2_16;
	mlib_f32 COS_6_16;
	mlib_f32 COS_7_16;
	mlib_f32 COS_4_16;
	mlib_f32 C_1_4;

/* First pass */

	LOAD_DATA_AA44;

	COS_1_16 = ((mlib_f32 *)mlib_cTable)[0];
	COS_2_16 = ((mlib_f32 *)mlib_cTable)[1];
	COS_6_16 = ((mlib_f32 *)mlib_cTable)[2];
	COS_7_16 = ((mlib_f32 *)mlib_cTable)[3];
	COS_4_16 = ((mlib_f32 *)mlib_cTable)[4];
	C_1_4 = ((mlib_f32 *)mlib_cTable)[5];

	TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)

	LOAD_MISMATCH;

	IDCT44(d0, d1, d2, d3);

	p00 = vis_fpadd16(dx7, dx1);
	p10 = vis_fpadd16(dx3, dx2);
	p20 = vis_fpadd16(dx0, dx4);
	p30 = vis_fpadd16(dx8, dx6);
	p40 = vis_fpsub16(dx8, dx6);
	p50 = vis_fpsub16(dx0, dx4);
	p60 = vis_fpsub16(dx3, dx2);
	p70 = vis_fpsub16(dx7, dx1);

/* Special case when element#63 == 1 */

	if (coeffs[63] != 1) {
		IDCTS(zero, d7);
		p01 = dx1;
		p11 = dx2;
		p21 = dx4;
		p31 = dx6;
		p41 = vis_fpsub16(zero, dx6);
		p51 = vis_fpsub16(zero, dx4);
		p61 = vis_fpsub16(zero, dx2);
		p71 = vis_fpsub16(zero, dx1);

		TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
		TRANSPOSE1(p01, p11, p21, p31, d7)

/* Second pass */
		IDCTS1(d0, d1, d2, d3, d7);
		TRANSPOSE(p40, p50, p60, p70, d0, d1, d2, d3)
		outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		TRANSPOSE1(p41, p51, p61, p71, d7)
		IDCTS1(d0, d1, d2, d3, d7);
		outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		return (MLIB_SUCCESS);
	} else {
/* Second pass */
		TRANSPOSE(p00, p10, p20, p30, d0, d1, d2, d3)
		d7 = *((mlib_d64 *)&val0);

		IDCTS1(d0, d1, d2, d3, d7);
		TRANSPOSE(p40, p50, p60, p70, d0, d1, d2, d3)
		outPtr[0] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[2] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[4] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[6] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[8] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[10] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[12] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[14] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		d7 = *((mlib_d64 *)&val1);
		IDCTS1(d0, d1, d2, d3, d7);
		outPtr[1] = vis_fmul8x16(C_1_4, vis_fpadd16(dx7, dx1));
		outPtr[3] = vis_fmul8x16(C_1_4, vis_fpadd16(dx3, dx2));
		outPtr[5] = vis_fmul8x16(C_1_4, vis_fpadd16(dx0, dx4));
		outPtr[7] = vis_fmul8x16(C_1_4, vis_fpadd16(dx8, dx6));
		outPtr[9] = vis_fmul8x16(C_1_4, vis_fpsub16(dx8, dx6));
		outPtr[11] = vis_fmul8x16(C_1_4, vis_fpsub16(dx0, dx4));
		outPtr[13] = vis_fmul8x16(C_1_4, vis_fpsub16(dx3, dx2));
		outPtr[15] = vis_fmul8x16(C_1_4, vis_fpsub16(dx7, dx1));

		return (MLIB_SUCCESS);
	}
}
void
jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
                          JSAMPARRAY output_buf, JDIMENSION output_col)
{
    short *dct_table = (short *)dct_table_;
    int *outptr;

    __vector short row0, row1, row2, row3, row4, row5, row6, row7,
             col0, col1, col2, col3, col4, col5, col6, col7,
             quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
             tmp0, tmp1, tmp2, tmp3, z3, z4,
             z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
             row71l, row71h, row26l, row26h, row53l, row53h,
             out0, out1, out2, out3, out4, out5, out6, out7;
    __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
             tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
             z3l, z3h, z4l, z4h,
             out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
             out5l, out5h, out6l, out6h, out7l, out7h;
    __vector signed char outb;

    /* Constants */
    __vector short pw_zero = { __8X(0) },
                   pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
                   pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
                   pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
                   pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
                   pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
                   pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
                   pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
                   pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
    __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
    __vector int pd_zero = { __4X(0) },
                 pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
                 pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
    __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
                          descale_p2 = { __4X(DESCALE_P2) },
                          const_bits = { __4X(CONST_BITS) };
    __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

    /* Pass 1: process columns */

    col0 = vec_ld(0, coef_block);
    col1 = vec_ld(16, coef_block);
    col2 = vec_ld(32, coef_block);
    col3 = vec_ld(48, coef_block);
    col4 = vec_ld(64, coef_block);
    col5 = vec_ld(80, coef_block);
    col6 = vec_ld(96, coef_block);
    col7 = vec_ld(112, coef_block);

    tmp1 = vec_or(col1, col2);
    tmp2 = vec_or(col3, col4);
    tmp1 = vec_or(tmp1, tmp2);
    tmp3 = vec_or(col5, col6);
    tmp3 = vec_or(tmp3, col7);
    tmp1 = vec_or(tmp1, tmp3);

    quant0 = vec_ld(0, dct_table);
    col0 = vec_mladd(col0, quant0, pw_zero);

    if (vec_all_eq(tmp1, pw_zero)) {
        /* AC terms all zero */

        col0 = vec_sl(col0, pass1_bits);

        row0 = vec_splat(col0, 0);
        row1 = vec_splat(col0, 1);
        row2 = vec_splat(col0, 2);
        row3 = vec_splat(col0, 3);
        row4 = vec_splat(col0, 4);
        row5 = vec_splat(col0, 5);
        row6 = vec_splat(col0, 6);
        row7 = vec_splat(col0, 7);

    } else {

        quant1 = vec_ld(16, dct_table);
        quant2 = vec_ld(32, dct_table);
        quant3 = vec_ld(48, dct_table);
        quant4 = vec_ld(64, dct_table);
        quant5 = vec_ld(80, dct_table);
        quant6 = vec_ld(96, dct_table);
        quant7 = vec_ld(112, dct_table);

        col1 = vec_mladd(col1, quant1, pw_zero);
        col2 = vec_mladd(col2, quant2, pw_zero);
        col3 = vec_mladd(col3, quant3, pw_zero);
        col4 = vec_mladd(col4, quant4, pw_zero);
        col5 = vec_mladd(col5, quant5, pw_zero);
        col6 = vec_mladd(col6, quant6, pw_zero);
        col7 = vec_mladd(col7, quant7, pw_zero);

        DO_IDCT(col, 1);

        TRANSPOSE(out, row);
    }

    /* Pass 2: process rows */

    DO_IDCT(row, 2);

    TRANSPOSE(out, col);

    outb = vec_packs(col0, col0);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[0] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col1, col1);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[1] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col2, col2);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[2] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col3, col3);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[3] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col4, col4);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[4] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col5, col5);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[5] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col6, col6);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[6] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);

    outb = vec_packs(col7, col7);
    outb = vec_add(outb, pb_centerjsamp);
    outptr = (int *)(output_buf[7] + output_col);
    vec_ste((__vector int)outb, 0, outptr);
    vec_ste((__vector int)outb, 4, outptr);
}
mlib_status
__mlib_VideoDCT8x8Quantize_S16_S16_B12_NA(
    mlib_s16 coeffs[64],
    const mlib_s16 *block,
    const mlib_d64 qtable[64])
{
    mlib_d64 *sp = (mlib_d64 *)block;
    mlib_d64 *dp = (mlib_d64 *)coeffs;

    mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
    mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
    mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
    mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
    mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
    mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
    mlib_f32 FCOS, c17, c26, c35, c_4;
    mlib_s32 mask;
    mlib_d64 w_const = vis_to_double_dup(0x4000);

    if (block == NULL || coeffs == NULL)
        return (MLIB_FAILURE);

    if (!(((mlib_addr)block | (mlib_addr)coeffs) & 7)) {
        return (__mlib_VideoDCT8x8Quantize_S16_S16_B12(coeffs,
                block, qtable));
    }

    vis_write_gsr(1 << 3);
    /*
     * first stage
     */

    LOAD_DATA_GE_INTER1;

    TRANSPOSE(d00, d20, d40, d60, r00, r10, r20, r30);
    TRANSPOSE(d10, d30, d50, d70, r40, r50, r60, r70);
    LOADCONSTS4_12;

    PREPARE_DATA_INTER(0);

    LOAD_DATA_GE_INTER2;
    TRANSPOSE(d01, d21, d41, d61, r01, r11, r21, r31);

    COMPUTING_DATA(0);

    TRANSPOSE(d11, d31, d51, d71, r41, r51, r61, r71);
    PREPARE_DATA_INTER(1);
    COMPUTING_DATA(1);

    /*
     * second stage
     */

    TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);
    TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
    PREPARE_DATA_INTER(0);
    TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
    COMPUTING_DATA_12(0);

    TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);
    ENDSCALE_12(0);


    dp = (mlib_d64 *)vis_alignaddr(coeffs, -1);
    mask = 0xFF >> ((mlib_addr)coeffs - (mlib_addr)dp);
    vis_alignaddrl((void *)coeffs, 0);

    PREPARE_DATA_INTER(1);
    COMPUTING_DATA_12(1);

    ENDSCALE_12(1);

    Quant_ST_NA(d00, d00, qtable[0]);
    Quant_ST_NA(d01, d01, qtable[1]);
    Quant_ST_NA(d10, d10, qtable[2]);
    Quant_ST_NA(d11, d11, qtable[3]);
    Quant_ST_NA(d20, d20, qtable[4]);
    Quant_ST_NA(d21, d21, qtable[5]);
    Quant_ST_NA(d30, d30, qtable[6]);
    Quant_ST_NA(d31, d31, qtable[7]);
    Quant_ST_NA(d40, d40, qtable[8]);
    Quant_ST_NA(d41, d41, qtable[9]);
    Quant_ST_NA(d50, d50, qtable[10]);
    Quant_ST_NA(d51, d51, qtable[11]);
    Quant_ST_NA(d60, d60, qtable[12]);
    Quant_ST_NA(d61, d61, qtable[13]);
    Quant_ST_NA(d70, d70, qtable[14]);
    Quant_ST_NA(d71, d71, qtable[15]);

    dp[1] = vis_faligndata(d00, d01);
    dp[2] = vis_faligndata(d01, d10);
    dp[3] = vis_faligndata(d10, d11);
    dp[4] = vis_faligndata(d11, d20);
    dp[5] = vis_faligndata(d20, d21);
    dp[6] = vis_faligndata(d21, d30);
    dp[7] = vis_faligndata(d30, d31);
    dp[8] = vis_faligndata(d31, d40);
    dp[9] = vis_faligndata(d40, d41);
    dp[10] = vis_faligndata(d41, d50);
    dp[11] = vis_faligndata(d50, d51);
    dp[12] = vis_faligndata(d51, d60);
    dp[13] = vis_faligndata(d60, d61);
    dp[14] = vis_faligndata(d61, d70);
    dp[15] = vis_faligndata(d70, d71);
    vis_pst_8(vis_faligndata(d71, d71), dp + 16, ~mask);

    if ((mlib_addr)coeffs & 7)
        vis_pst_8(vis_faligndata(d00, d00), dp, mask);

    return (MLIB_SUCCESS);
}
mlib_status
__mlib_VideoDCT8x8Quantize_S16_S16_B12(
    mlib_s16 coeffs[64],
    const mlib_s16 *block,
    const mlib_d64 qtable[64])
{
    mlib_d64 *sp = (mlib_d64 *)block;
    mlib_d64 *dp = (mlib_d64 *)coeffs;

    mlib_d64 d00, d10, d20, d30, d40, d50, d60, d70;
    mlib_d64 d01, d11, d21, d31, d41, d51, d61, d71;
    mlib_d64 t00, t10, t20, t30, t40, t50, t60, t70, t80, t90;
    mlib_d64 t01, t11, t21, t31, t41, t51, t61, t71, t81, t91;
    mlib_d64 r00, r10, r20, r30, r40, r50, r60, r70;
    mlib_d64 r01, r11, r21, r31, r41, r51, r61, r71;
    mlib_f32 FCOS, c17, c26, c35, c_4;

    vis_write_gsr(1 << 3);
    /*
     * first stage
     */

    LOAD_DATA_AA_INTER1;
    TRANSPOSE(d00, d20, d40, d60, r00, r10, r20, r30);
    TRANSPOSE(d10, d30, d50, d70, r40, r50, r60, r70);
    LOADCONSTS4_12;

    PREPARE_DATA_INTER(0);

    LOAD_DATA_AA_INTER2;
    TRANSPOSE(d01, d21, d41, d61, r01, r11, r21, r31);

    COMPUTING_DATA(0);

    TRANSPOSE(d11, d31, d51, d71, r41, r51, r61, r71);
    PREPARE_DATA_INTER(1);
    COMPUTING_DATA(1);

    /*
     * second stage
     */

    TRANSPOSE(d01, d11, d21, d31, r40, r50, r60, r70);
    TRANSPOSE(d00, d10, d20, d30, r00, r10, r20, r30);
    PREPARE_DATA_INTER(0);
    TRANSPOSE(d40, d50, d60, d70, r01, r11, r21, r31);
    COMPUTING_DATA_12(0);

    TRANSPOSE(d41, d51, d61, d71, r41, r51, r61, r71);
    ENDSCALE_12(0);

    Quant_ST(0, d00, qtable[0]);
    Quant_ST(2, d10, qtable[2]);
    Quant_ST(4, d20, qtable[4]);
    Quant_ST(6, d30, qtable[6]);
    Quant_ST(8, d40, qtable[8]);
    Quant_ST(10, d50, qtable[10]);
    Quant_ST(12, d60, qtable[12]);
    Quant_ST(14, d70, qtable[14]);

    PREPARE_DATA_INTER(1);
    COMPUTING_DATA_12(1);

    ENDSCALE_12(1);

    Quant_ST(1, d01, qtable[1]);
    Quant_ST(3, d11, qtable[3]);
    Quant_ST(5, d21, qtable[5]);
    Quant_ST(7, d31, qtable[7]);
    Quant_ST(9, d41, qtable[9]);
    Quant_ST(11, d51, qtable[11]);
    Quant_ST(13, d61, qtable[13]);
    Quant_ST(15, d71, qtable[15]);

    return (MLIB_SUCCESS);
}
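/*
 * In the mediaLib VIS examples above, TRANSPOSE(i0, i1, i2, i3, o0, o1,
 * o2, o3) takes four 64-bit registers each holding four 16-bit values
 * and writes their 4x4 transpose; an 8x8 block is processed as four
 * such quadrants. A scalar model of one quadrant (semantics only; the
 * real macro uses VIS merge/permute primitives):
 */
static void transpose4x4_s16(const mlib_s16 in[4][4], mlib_s16 out[4][4])
{
    int i, j;
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            out[i][j] = in[j][i];
}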
Example #12
static struct tx_burst_sg_ret {
	unsigned int length; /* Effective length of sent packet. */
	unsigned int num; /* Number of segments in use. */
}
tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
	    struct rte_mbuf *buf, unsigned int elts_head,
	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
{
	unsigned int sent_size = 0;
	unsigned int j;
	int linearize = 0;

	/* When there are too many segments, extra segments are
	 * linearized in the last SGE. */
	if (unlikely(segs > RTE_DIM(*sges))) {
		segs = (RTE_DIM(*sges) - 1);
		linearize = 1;
	}
	/* Update element. */
	elt->buf = buf;
	/* Register segments as SGEs. */
	for (j = 0; (j != segs); ++j) {
		struct ibv_sge *sge = &(*sges)[j];
		uint32_t lkey;

		/* Retrieve Memory Region key for this memory pool. */
		lkey = txq_mp2mr(txq, txq_mb2mp(buf));
		if (unlikely(lkey == (uint32_t)-1)) {
			/* MR does not exist. */
			DEBUG("%p: unable to get MP <-> MR association",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* Update SGE. */
		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
		if (txq->priv->vf)
			rte_prefetch0((volatile void *)
				      (uintptr_t)sge->addr);
		sge->length = DATA_LEN(buf);
		sge->lkey = lkey;
		sent_size += sge->length;
		buf = NEXT(buf);
	}
	/* If buf is not NULL here and is not going to be linearized,
	 * nb_segs is not valid. */
	assert(j == segs);
	assert((buf == NULL) || (linearize));
	/* Linearize extra segments. */
	if (linearize) {
		struct ibv_sge *sge = &(*sges)[segs];
		linear_t *linear = &(*txq->elts_linear)[elts_head];
		unsigned int size = linearize_mbuf(linear, buf);

		assert(segs == (RTE_DIM(*sges) - 1));
		if (size == 0) {
			/* Invalid packet. */
			DEBUG("%p: packet too large to be linearized.",
			      (void *)txq);
			/* Clean up TX element. */
			elt->buf = NULL;
			goto stop;
		}
		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
		if (RTE_DIM(*sges) == 1) {
			do {
				struct rte_mbuf *next = NEXT(buf);

				rte_pktmbuf_free_seg(buf);
				buf = next;
			} while (buf != NULL);
			elt->buf = NULL;
		}
		/* Update SGE. */
		sge->addr = (uintptr_t)&(*linear)[0];
		sge->length = size;
		sge->lkey = txq->mr_linear->lkey;
		sent_size += size;
		/* Include last segment. */
		segs++;
	}
	return (struct tx_burst_sg_ret){
		.length = sent_size,
		.num = segs,
	};
stop:
	return (struct tx_burst_sg_ret){
		.length = -1,
		.num = -1,
	};
}

#endif /* MLX5_PMD_SGE_WR_N > 1 */

/**
 * DPDK callback for TX.
 *
 * @param dpdk_txq
 *   Generic pointer to TX queue structure.
 * @param[in] pkts
 *   Packets to transmit.
 * @param pkts_n
 *   Number of packets in array.
 *
 * @return
 *   Number of packets successfully transmitted (<= pkts_n).
 */
uint16_t
mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct txq *txq = (struct txq *)dpdk_txq;
	unsigned int elts_head = txq->elts_head;
	const unsigned int elts_n = txq->elts_n;
	unsigned int elts_comp_cd = txq->elts_comp_cd;
	unsigned int elts_comp = 0;
	unsigned int i;
	unsigned int max;
	int err;
	struct rte_mbuf *buf = pkts[0];

	assert(elts_comp_cd != 0);
	/* Prefetch first packet cacheline. */
	rte_prefetch0(buf);
	txq_complete(txq);
	max = (elts_n - (elts_head - txq->elts_tail));
	if (max > elts_n)
		max -= elts_n;
	assert(max >= 1);
	assert(max <= elts_n);
	/* Always leave one free entry in the ring. */
	--max;
	if (max == 0)
		return 0;
	if (max > pkts_n)
		max = pkts_n;
	for (i = 0; (i != max); ++i) {
		/* On the last iteration, do not read past the end of pkts[]. */
		struct rte_mbuf *buf_next = (i + 1 != max) ? pkts[i + 1] : NULL;
		unsigned int elts_head_next =
			(((elts_head + 1) == elts_n) ? 0 : elts_head + 1);
		struct txq_elt *elt = &(*txq->elts)[elts_head];
		unsigned int segs = NB_SEGS(buf);
#ifdef MLX5_PMD_SOFT_COUNTERS
		unsigned int sent_size = 0;
#endif
		uint32_t send_flags = 0;
#ifdef HAVE_VERBS_VLAN_INSERTION
		int insert_vlan = 0;
#endif /* HAVE_VERBS_VLAN_INSERTION */

		if (i + 1 < max)
			rte_prefetch0(buf_next);
		/* Request TX completion. */
		if (unlikely(--elts_comp_cd == 0)) {
			elts_comp_cd = txq->elts_comp_cd_init;
			++elts_comp;
			send_flags |= IBV_EXP_QP_BURST_SIGNALED;
		}
		/* Should we enable HW CKSUM offload */
		if (buf->ol_flags &
		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
			send_flags |= IBV_EXP_QP_BURST_IP_CSUM;
			/* HW does not support checksum offloads at arbitrary
			 * offsets but automatically recognizes the packet
			 * type. For inner L3/L4 checksums, only VXLAN (UDP)
			 * tunnels are currently supported. */
			if (RTE_ETH_IS_TUNNEL_PKT(buf->packet_type))
				send_flags |= IBV_EXP_QP_BURST_TUNNEL;
		}
		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
#ifdef HAVE_VERBS_VLAN_INSERTION
			if (!txq->priv->mps)
				insert_vlan = 1;
			else
#endif /* HAVE_VERBS_VLAN_INSERTION */
			{
				err = insert_vlan_sw(buf);
				if (unlikely(err))
					goto stop;
			}
		}
		if (likely(segs == 1)) {
			uintptr_t addr;
			uint32_t length;
			uint32_t lkey;
			uintptr_t buf_next_addr;

			/* Retrieve buffer information. */
			addr = rte_pktmbuf_mtod(buf, uintptr_t);
			length = DATA_LEN(buf);
			/* Update element. */
			elt->buf = buf;
			if (txq->priv->vf)
				rte_prefetch0((volatile void *)
					      (uintptr_t)addr);
			/* Prefetch next buffer data. */
			if (i + 1 < max) {
				buf_next_addr =
					rte_pktmbuf_mtod(buf_next, uintptr_t);
				rte_prefetch0((volatile void *)
					      (uintptr_t)buf_next_addr);
			}
			/* Put packet into send queue. */
#if MLX5_PMD_MAX_INLINE > 0
			if (length <= txq->max_inline) {
#ifdef HAVE_VERBS_VLAN_INSERTION
				if (insert_vlan)
					err = txq->send_pending_inline_vlan
						(txq->qp,
						 (void *)addr,
						 length,
						 send_flags,
						 &buf->vlan_tci);
				else
#endif /* HAVE_VERBS_VLAN_INSERTION */
					err = txq->send_pending_inline
						(txq->qp,
						 (void *)addr,
						 length,
						 send_flags);
			} else
#endif
			{
				/* Retrieve Memory Region key for this
				 * memory pool. */
				lkey = txq_mp2mr(txq, txq_mb2mp(buf));
				if (unlikely(lkey == (uint32_t)-1)) {
					/* MR does not exist. */
					DEBUG("%p: unable to get MP <-> MR"
					      " association", (void *)txq);
					/* Clean up TX element. */
					elt->buf = NULL;
					goto stop;
				}
#ifdef HAVE_VERBS_VLAN_INSERTION
				if (insert_vlan)
					err = txq->send_pending_vlan
						(txq->qp,
						 addr,
						 length,
						 lkey,
						 send_flags,
						 &buf->vlan_tci);
				else
#endif /* HAVE_VERBS_VLAN_INSERTION */
					err = txq->send_pending
						(txq->qp,
						 addr,
						 length,
						 lkey,
						 send_flags);
			}
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += length;
#endif
		} else {
#if MLX5_PMD_SGE_WR_N > 1
			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
			struct tx_burst_sg_ret ret;

			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
					  &sges);
			if (ret.length == (unsigned int)-1)
				goto stop;
			/* Put SG list into send queue. */
#ifdef HAVE_VERBS_VLAN_INSERTION
			if (insert_vlan)
				err = txq->send_pending_sg_list_vlan
					(txq->qp,
					 sges,
					 ret.num,
					 send_flags,
					 &buf->vlan_tci);
			else
#endif /* HAVE_VERBS_VLAN_INSERTION */
				err = txq->send_pending_sg_list
					(txq->qp,
					 sges,
					 ret.num,
					 send_flags);
			if (unlikely(err))
				goto stop;
#ifdef MLX5_PMD_SOFT_COUNTERS
			sent_size += ret.length;
#endif
#else /* MLX5_PMD_SGE_WR_N > 1 */
			DEBUG("%p: TX scattered buffers support not"
			      " compiled in", (void *)txq);
			goto stop;
#endif /* MLX5_PMD_SGE_WR_N > 1 */
		}
		elts_head = elts_head_next;
		buf = buf_next;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment sent bytes counter. */
		txq->stats.obytes += sent_size;
#endif
	}
stop:
	/* Take a shortcut if nothing must be sent. */
	if (unlikely(i == 0))
		return 0;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment sent packets counter. */
	txq->stats.opackets += i;
#endif
	/* Ring QP doorbell. */
	err = txq->send_flush(txq->qp);
	if (unlikely(err)) {
		/* A nonzero value is not supposed to be returned.
		 * Nothing can be done about it. */
		DEBUG("%p: send_flush() failed with error %d",
		      (void *)txq, err);
	}
	txq->elts_head = elts_head;
	txq->elts_comp += elts_comp;
	txq->elts_comp_cd = elts_comp_cd;
	return i;
}

/**
 * Translate RX completion flags to packet type.
 *
 * @param flags
 *   RX completion flags returned by poll_length_flags().
 *
 * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
 *
 * @return
 *   Packet type for struct rte_mbuf.
 */
static inline uint32_t
rxq_cq_to_pkt_type(uint32_t flags)
{
	uint32_t pkt_type;

	if (flags & IBV_EXP_CQ_RX_TUNNEL_PACKET)
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_OUTER_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_INNER_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_INNER_L3_IPV6);
	else
		pkt_type =
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV4_PACKET,
				  RTE_PTYPE_L3_IPV4) |
			TRANSPOSE(flags,
				  IBV_EXP_CQ_RX_IPV6_PACKET,
				  RTE_PTYPE_L3_IPV6);
	return pkt_type;
}
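/*
 * Worked example of the translation above, assuming the single-bit-mask
 * TRANSPOSE sketched earlier: if flags has IBV_EXP_CQ_RX_IPV4_PACKET
 * set and the tunnel bit clear, the second branch yields
 * pkt_type == RTE_PTYPE_L3_IPV4; the IPv6 TRANSPOSE term contributes 0
 * because (flags & IBV_EXP_CQ_RX_IPV6_PACKET) == 0.
 */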
void
jsimd_fdct_ifast_altivec (DCTELEM *data)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z1, z2, z3, z4, z5, z11, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
    pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
    pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
    pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };

  /* Pass 1: process rows */

  row0 = vec_ld(0, data);
  row1 = vec_ld(16, data);
  row2 = vec_ld(32, data);
  row3 = vec_ld(48, data);
  row4 = vec_ld(64, data);
  row5 = vec_ld(80, data);
  row6 = vec_ld(96, data);
  row7 = vec_ld(112, data);

  TRANSPOSE(row, col);

  tmp0 = vec_add(col0, col7);
  tmp7 = vec_sub(col0, col7);
  tmp1 = vec_add(col1, col6);
  tmp6 = vec_sub(col1, col6);
  tmp2 = vec_add(col2, col5);
  tmp5 = vec_sub(col2, col5);
  tmp3 = vec_add(col3, col4);
  tmp4 = vec_sub(col3, col4);

  DO_FDCT();

  /* Pass 2: process columns */

  TRANSPOSE(out, row);

  tmp0 = vec_add(row0, row7);
  tmp7 = vec_sub(row0, row7);
  tmp1 = vec_add(row1, row6);
  tmp6 = vec_sub(row1, row6);
  tmp2 = vec_add(row2, row5);
  tmp5 = vec_sub(row2, row5);
  tmp3 = vec_add(row3, row4);
  tmp4 = vec_sub(row3, row4);

  DO_FDCT();

  vec_st(out0, 0, data);
  vec_st(out1, 16, data);
  vec_st(out2, 32, data);
  vec_st(out3, 48, data);
  vec_st(out4, 64, data);
  vec_st(out5, 80, data);
  vec_st(out6, 96, data);
  vec_st(out7, 112, data);
}