Exemplo n.º 1
0
/*
 * Complex matrix-vector accumulation over the columns of A:
 *
 *     y[j] += alpha * sum_{i < m} op(A[i][j]) * x[i]      for j = 0 .. n-1
 *
 * A is addressed column by column (consecutive rows of one column are
 * adjacent, columns are lda complex elements apart) and every complex value
 * is an interleaved (real, imaginary) pair of FLOATs.
 *
 * Conjugation is selected at compile time, as visible in the scalar tail:
 *   - CONJ and XCONJ both defined or both undefined: the products are a * x;
 *     otherwise they are conj(a) * x.
 *   - XCONJ undefined: y += alpha * sum; XCONJ defined: y += alpha * conj(sum).
 *
 * Rows are consumed in blocks of at most NBMAX rows (always a multiple of 4)
 * by zgemv_kernel_4x4 / 4x2 / 4x1, which are defined outside this block and
 * presumably perform the same accumulation for 4, 2 or 1 columns at a time.
 * The final m % 4 rows are handled by the scalar code at the end.
 *
 * Parameters:
 *   m, n             number of rows / columns of A; nothing is done if < 1
 *   dummy1           unused
 *   alpha_r, alpha_i real and imaginary part of the scale factor
 *   a, lda           matrix and its column stride in complex elements
 *   x, inc_x         input vector and its stride in complex elements
 *   y, inc_y         accumulated output vector and its stride
 *   buffer           scratch area handed to copy_x() to hold a block of x
 *                    when inc_x is not 1 (required size is not visible here)
 *
 * Returns 0 in every case.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *ap[8];		/* only the first four column pointers are set here */
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4;
	FLOAT ybuffer[8];	/* results of one 4-column kernel call (4 complex values) */
	FLOAT *xbuffer;
	FLOAT alpha[2];

	if (m < 1) return 0;
	if (n < 1) return 0;

	/*
	 * Convert all strides from complex elements to FLOAT units.  Plain
	 * multiplication is used instead of a left shift because shifting a
	 * negative signed value is undefined behaviour in C.
	 */
	inc_x *= 2;
	inc_y *= 2;
	lda   *= 2;
	lda4   = lda * 4;

	xbuffer = buffer;

	n1 = n / 4;		/* number of full 4-column groups */
	n2 = n % 4;		/* leftover columns               */

	m3 = m % 4;		/* rows left for the scalar tail  */
	m1 = m - m3;		/* rows handled by the kernels    */
	/* Size of the final, partial row block (0 if there is none). */
	m2 = (m % NBMAX) - m3;

	alpha[0] = alpha_r;
	alpha[1] = alpha_i;

	BLASLONG NB = NBMAX;

	/* The loop ends after the first block that is smaller than NBMAX. */
	while (NB == NBMAX) {
		m1 -= NB;
		if (m1 < 0) {
			if (m2 == 0) break;
			NB = m2;
		}

		y_ptr = y;
		a_ptr = a;
		x_ptr = x;
		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		/* The kernels are given a unit-stride x: pack it unless it already is. */
		if (inc_x != 2)
			copy_x(NB, x_ptr, xbuffer, inc_x);
		else
			xbuffer = x_ptr;

		if (inc_y == 2) {
			/* Unit-stride y: the kernels accumulate straight into y. */
			for (i = 0; i < n1; i++) {
				zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
				y_ptr += 8;
			}

			if (n2 & 2) {
				zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha);
				a_ptr += lda * 2;
				y_ptr += 4;
			}

			if (n2 & 1) {
				/* Last column of this row block; the pointers are reset afterwards. */
				zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
			}
		} else {
			/* Strided y: accumulate into a zeroed local buffer, then scatter. */
			for (i = 0; i < n1; i++) {
				memset(ybuffer, 0, sizeof(ybuffer));
				zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;

				y_ptr[0] += ybuffer[0];
				y_ptr[1] += ybuffer[1];
				y_ptr    += inc_y;
				y_ptr[0] += ybuffer[2];
				y_ptr[1] += ybuffer[3];
				y_ptr    += inc_y;
				y_ptr[0] += ybuffer[4];
				y_ptr[1] += ybuffer[5];
				y_ptr    += inc_y;
				y_ptr[0] += ybuffer[6];
				y_ptr[1] += ybuffer[7];
				y_ptr    += inc_y;
			}

			for (i = 0; i < n2; i++) {
				memset(ybuffer, 0, sizeof(ybuffer));
				zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha);
				a_ptr    += lda;
				y_ptr[0] += ybuffer[0];
				y_ptr[1] += ybuffer[1];
				y_ptr    += inc_y;
			}
		}

		/* Advance to the next block of rows. */
		a += 2 * NB;
		x += NB * inc_x;
	}

	if (m3 == 0) return 0;

	/* Scalar tail: a and x now point at the first of the m3 remaining rows. */
	x_ptr = x;
	a_ptr = a;
	y_ptr = y;

	if (m3 == 3) {
		FLOAT temp_r;
		FLOAT temp_i;
		FLOAT x0 = x_ptr[0];
		FLOAT x1 = x_ptr[1];
		x_ptr += inc_x;
		FLOAT x2 = x_ptr[0];
		FLOAT x3 = x_ptr[1];
		x_ptr += inc_x;
		FLOAT x4 = x_ptr[0];
		FLOAT x5 = x_ptr[1];

		for (j = 0; j < n; j++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r  = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i  = a_ptr[0] * x1 + a_ptr[1] * x0;
			temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
			temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
			temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
			temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
			temp_r  = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i  = a_ptr[0] * x1 - a_ptr[1] * x0;
			temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
			temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
			temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
			temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif

#if !defined(XCONJ)
			y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
			y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
			y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
			y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif

			a_ptr += lda;
			y_ptr += inc_y;
		}
		return 0;
	}

	if (m3 == 2) {
		FLOAT temp_r;
		FLOAT temp_i;
		FLOAT temp_r1;
		FLOAT temp_i1;
		FLOAT x0 = x_ptr[0];
		FLOAT x1 = x_ptr[1];
		x_ptr += inc_x;
		FLOAT x2 = x_ptr[0];
		FLOAT x3 = x_ptr[1];
		FLOAT ar = alpha[0];
		FLOAT ai = alpha[1];

		/* Two columns per iteration over the even part of n. */
		for (j = 0; j < (n & -2); j += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r   = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i   = a_ptr[0] * x1 + a_ptr[1] * x0;
			temp_r  += a_ptr[2] * x2 - a_ptr[3] * x3;
			temp_i  += a_ptr[2] * x3 + a_ptr[3] * x2;
			a_ptr   += lda;
			temp_r1  = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i1  = a_ptr[0] * x1 + a_ptr[1] * x0;
			temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
			temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
			temp_r   = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i   = a_ptr[0] * x1 - a_ptr[1] * x0;
			temp_r  += a_ptr[2] * x2 + a_ptr[3] * x3;
			temp_i  += a_ptr[2] * x3 - a_ptr[3] * x2;
			a_ptr   += lda;
			temp_r1  = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i1  = a_ptr[0] * x1 - a_ptr[1] * x0;
			temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
			temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
			y_ptr[0] += ar * temp_r - ai * temp_i;
			y_ptr[1] += ar * temp_i + ai * temp_r;
			y_ptr    += inc_y;
			y_ptr[0] += ar * temp_r1 - ai * temp_i1;
			y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
			y_ptr[0] += ar * temp_r + ai * temp_i;
			y_ptr[1] -= ar * temp_i - ai * temp_r;
			y_ptr    += inc_y;
			y_ptr[0] += ar * temp_r1 + ai * temp_i1;
			y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

			a_ptr += lda;
			y_ptr += inc_y;
		}

		/* Odd final column, if any. */
		for (; j < n; j++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r  = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i  = a_ptr[0] * x1 + a_ptr[1] * x0;
			temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
			temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
			temp_r  = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i  = a_ptr[0] * x1 - a_ptr[1] * x0;
			temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
			temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif

#if !defined(XCONJ)
			y_ptr[0] += ar * temp_r - ai * temp_i;
			y_ptr[1] += ar * temp_i + ai * temp_r;
#else
			y_ptr[0] += ar * temp_r + ai * temp_i;
			y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

			a_ptr += lda;
			y_ptr += inc_y;
		}

		return 0;
	}

	if (m3 == 1) {
		FLOAT temp_r;
		FLOAT temp_i;
		FLOAT temp_r1;
		FLOAT temp_i1;
		FLOAT x0 = x_ptr[0];
		FLOAT x1 = x_ptr[1];
		FLOAT ar = alpha[0];
		FLOAT ai = alpha[1];

		/* Two columns per iteration over the even part of n. */
		for (j = 0; j < (n & -2); j += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r   = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i   = a_ptr[0] * x1 + a_ptr[1] * x0;
			a_ptr   += lda;
			temp_r1  = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i1  = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
			temp_r   = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i   = a_ptr[0] * x1 - a_ptr[1] * x0;
			a_ptr   += lda;
			temp_r1  = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i1  = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
			y_ptr[0] += ar * temp_r - ai * temp_i;
			y_ptr[1] += ar * temp_i + ai * temp_r;
			y_ptr    += inc_y;
			y_ptr[0] += ar * temp_r1 - ai * temp_i1;
			y_ptr[1] += ar * temp_i1 + ai * temp_r1;
#else
			y_ptr[0] += ar * temp_r + ai * temp_i;
			y_ptr[1] -= ar * temp_i - ai * temp_r;
			y_ptr    += inc_y;
			y_ptr[0] += ar * temp_r1 + ai * temp_i1;
			y_ptr[1] -= ar * temp_i1 - ai * temp_r1;
#endif

			a_ptr += lda;
			y_ptr += inc_y;
		}

		/* Odd final column, if any. */
		for (; j < n; j++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
			temp_r  = a_ptr[0] * x0 - a_ptr[1] * x1;
			temp_i  = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
			temp_r  = a_ptr[0] * x0 + a_ptr[1] * x1;
			temp_i  = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif

#if !defined(XCONJ)
			y_ptr[0] += ar * temp_r - ai * temp_i;
			y_ptr[1] += ar * temp_i + ai * temp_r;
#else
			y_ptr[0] += ar * temp_r + ai * temp_i;
			y_ptr[1] -= ar * temp_i - ai * temp_r;
#endif

			a_ptr += lda;
			y_ptr += inc_y;
		}
		return 0;
	}

	return 0;
}
Exemplo n.º 2
0
/*
 * Complex matrix-vector accumulation over the rows of A:
 *
 *     y[i] += alpha * sum_{j < n} op(A[i][j]) * x[j]      for i = 0 .. m-1
 *
 * A is addressed column by column (consecutive rows of one column are
 * adjacent, columns are lda complex elements apart) and every complex value
 * is an interleaved (real, imaginary) pair of FLOATs.
 *
 * Conjugation is selected at compile time, as visible in the scalar tail:
 *   - CONJ and XCONJ both defined or both undefined: the products are a * x;
 *     otherwise they are conj(a) * x.
 *   - XCONJ undefined: y += alpha * sum; XCONJ defined: y += alpha * conj(sum).
 *
 * Rows are consumed in blocks of at most NBMAX rows (always a multiple of 4).
 * For each block the partial results are gathered in `buffer` by
 * zgemv_kernel_4x4 / 4x2 / 4x1 and then folded into y by add_y(); these
 * helpers are defined outside this block, so their exact contract is assumed
 * to match the scalar tail.  The final m % 4 rows are handled by the scalar
 * code at the end.
 *
 * Parameters:
 *   m, n             number of rows / columns of A; nothing is done if < 1
 *   dummy1           unused
 *   alpha_r, alpha_i real and imaginary part of the scale factor
 *   a, lda           matrix and its column stride in complex elements
 *   x, inc_x         input vector and its stride in complex elements
 *   y, inc_y         accumulated output vector and its stride
 *   buffer           scratch area for one row block; NB complex values of it
 *                    are zeroed and used per block
 *
 * Returns 0 in every case.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	FLOAT *ap[4];
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4;
	FLOAT xbuffer[8];	/* four packed complex x values for strided x */
	FLOAT *ybuffer;

	if (m < 1) return 0;
	if (n < 1) return 0;

	ybuffer = buffer;

	/* Convert all strides from complex elements to FLOAT units. */
	inc_x *= 2;
	inc_y *= 2;
	lda   *= 2;
	lda4   = 4 * lda;

	n1 = n / 4;		/* number of full 4-column groups */
	n2 = n % 4;		/* leftover columns               */

	m3 = m % 4;		/* rows left for the scalar tail  */
	m1 = m - m3;		/* rows handled by the kernels    */
	/* Size of the final, partial row block (0 if there is none). */
	m2 = (m % NBMAX) - m3;

	y_ptr = y;

	BLASLONG NB = NBMAX;

	/* The loop ends after the first block that is smaller than NBMAX. */
	while (NB == NBMAX) {
		m1 -= NB;
		if (m1 < 0) {
			if (m2 == 0) break;
			NB = m2;
		}

		a_ptr = a;
		ap[0] = a_ptr;
		ap[1] = a_ptr + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;
		x_ptr = x;

		/*
		 * Clear NB complex accumulators.  The byte count is derived from
		 * FLOAT rather than hard-coded so it stays right for any element
		 * width.
		 */
		memset(ybuffer, 0, sizeof(FLOAT) * 2 * NB);

		if (inc_x == 2) {
			/* Unit-stride x: the kernels read x in place. */
			for (i = 0; i < n1; i++) {
				zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
				x_ptr += 8;
			}

			if (n2 & 2) {
				zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer);
				x_ptr += 4;
				a_ptr += 2 * lda;
			}

			if (n2 & 1) {
				/* Last column of this row block; the pointers are reset afterwards. */
				zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
			}
		} else {
			/* Strided x: pack the needed values into xbuffer first. */
			for (i = 0; i < n1; i++) {
				xbuffer[0] = x_ptr[0];
				xbuffer[1] = x_ptr[1];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				xbuffer[3] = x_ptr[1];
				x_ptr += inc_x;
				xbuffer[4] = x_ptr[0];
				xbuffer[5] = x_ptr[1];
				x_ptr += inc_x;
				xbuffer[6] = x_ptr[0];
				xbuffer[7] = x_ptr[1];
				x_ptr += inc_x;

				zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer);
				ap[0] += lda4;
				ap[1] += lda4;
				ap[2] += lda4;
				ap[3] += lda4;
				a_ptr += lda4;
			}

			for (i = 0; i < n2; i++) {
				xbuffer[0] = x_ptr[0];
				xbuffer[1] = x_ptr[1];
				x_ptr += inc_x;
				zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
				a_ptr += lda;
			}
		}

		/* Fold the block result into y and move on to the next row block. */
		add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
		a     += 2 * NB;
		y_ptr += NB * inc_y;
	}

	if (m3 == 0) return 0;

	/*
	 * Scalar tail: a and y_ptr now point at the first of the m3 remaining
	 * rows.  Each case has a fast path for the fully contiguous layout
	 * (columns exactly m3 elements apart, unit-stride x).
	 */
	if (m3 == 1) {
		a_ptr = a;
		x_ptr = x;
		FLOAT temp_r = 0.0;
		FLOAT temp_i = 0.0;

		if (lda == 2 && inc_x == 2) {
			/* Two columns per iteration over the even part of n. */
			for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
				temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
				temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
#else
				temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
				temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
				temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
#endif

				a_ptr += 4;
				x_ptr += 4;
			}

			/* Odd final column, if any. */
			for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
				temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

				a_ptr += 2;
				x_ptr += 2;
			}
		} else {
			for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
#else
				temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
#endif

				a_ptr += lda;
				x_ptr += inc_x;
			}
		}

#if !defined(XCONJ)
		y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
		y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
		y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
		y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
		return 0;
	}

	if (m3 == 2) {
		a_ptr = a;
		x_ptr = x;
		FLOAT temp_r0 = 0.0;
		FLOAT temp_i0 = 0.0;
		FLOAT temp_r1 = 0.0;
		FLOAT temp_i1 = 0.0;

		if (lda == 4 && inc_x == 2) {
			/* Two columns per iteration over the even part of n. */
			for (i = 0; i < (n & -2); i += 2) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];

				temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
				temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
				temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
				temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
#else
				temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];

				temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
				temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
				temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
				temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
#endif

				a_ptr += 8;
				x_ptr += 4;
			}

			/* Odd final column, if any. */
			for (; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
				temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

				a_ptr += 4;
				x_ptr += 2;
			}
		} else {
			for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
#else
				temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
#endif

				a_ptr += lda;
				x_ptr += inc_x;
			}
		}

#if !defined(XCONJ)
		y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
		y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
		y_ptr    += inc_y;
		y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
		y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
		y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
		y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
		y_ptr    += inc_y;
		y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
		y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
		return 0;
	}

	if (m3 == 3) {
		a_ptr = a;
		x_ptr = x;
		FLOAT temp_r0 = 0.0;
		FLOAT temp_i0 = 0.0;
		FLOAT temp_r1 = 0.0;
		FLOAT temp_i1 = 0.0;
		FLOAT temp_r2 = 0.0;
		FLOAT temp_i2 = 0.0;

		if (lda == 6 && inc_x == 2) {
			/* Same arithmetic as the general path, with constant strides. */
			for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
				temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
				temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
				temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
				temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
				temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

				a_ptr += 6;
				x_ptr += 2;
			}
		} else {
			for (i = 0; i < n; i++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
				temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
				temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
				temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
#else
				temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
				temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
				temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
				temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
				temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
#endif

				a_ptr += lda;
				x_ptr += inc_x;
			}
		}

#if !defined(XCONJ)
		y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
		y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
		y_ptr    += inc_y;
		y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
		y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
		y_ptr    += inc_y;
		y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
		y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
#else
		y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
		y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
		y_ptr    += inc_y;
		y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
		y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
		y_ptr    += inc_y;
		y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
		y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
#endif
		return 0;
	}

	return 0;
}