Example #1
0
/*
 * Symmetric matrix-vector update driven by the upper triangle of `a`
 * (the routine identified itself as "Symv_U" in its former debug trace).
 *
 * For every column j in [m - offset, m) the code visible here reads
 * a[j*lda + i] only for i <= j and applies
 *
 *     y[i] += alpha * x[j] * a[j*lda + i]                 for 0 <= i < j
 *     y[j] += alpha * x[j] * a[j*lda + j]
 *           + alpha * sum_{i<j} a[j*lda + i] * x[i]
 *
 * Parameters:
 *   m       one past the last column processed
 *   offset  number of trailing columns processed (columns m-offset .. m-1)
 *   alpha   scalar applied to every contribution
 *   a       matrix storage, element (i, j) at a[j*lda + i]
 *   lda     distance between consecutive columns of a
 *   x,inc_x input vector and its stride
 *   y,inc_y vector updated in place and its stride
 *   buffer  not used by this routine
 *
 * Returns 0 in all cases.
 *
 * The dsymv_kernel_* helpers are defined outside this block; the comments at
 * their call sites describe what they presumably do -- confirm against their
 * definitions.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	BLASLONG k;
	BLASLONG j1;
	BLASLONG m2;
	FLOAT temp1;
	FLOAT temp2;
	FLOAT *a0, *a1, *a2, *a3;
	FLOAT tmp1[4];
	FLOAT tmp2[4];

	BLASLONG m1 = m - offset;	/* first column to process */
	BLASLONG mrange = m - m1;	/* number of columns to process */

	/* Generic path: strided vectors, or too few columns for the blocked code. */
	if ( (inc_x != 1) || (inc_y != 1) || (mrange < 16) )
	{
		BLASLONG jx = m1 * inc_x;
		BLASLONG jy = m1 * inc_y;

		for (j = m1; j < m; j++)
		{
			BLASLONG ix = 0;
			BLASLONG iy = 0;

			temp1 = alpha * x[jx];
			temp2 = 0.0;
			for (i = 0; i < j; i++)
			{
				y[iy] += temp1 * a[j*lda+i];
				temp2 += a[j*lda+i] * x[ix];
				ix += inc_x;
				iy += inc_y;
			}
			y[jy] += temp1 * a[j*lda+j] + alpha * temp2;
			jx    += inc_x;
			jy    += inc_y;
		}
		return(0);
	}

	/* Unit-stride path: four columns at a time up to m2, then single columns. */
	m2 = m - ( mrange % 4 );

	for (j = m1; j < m2; j += 4)
	{
		for (k = 0; k < 4; k++)
		{
			tmp1[k] = alpha * x[j+k];
			tmp2[k] = 0.0;
		}
		a0 = &a[j*lda];
		a1 = a0 + lda;
		a2 = a1 + lda;
		a3 = a2 + lda;

		/*
		 * Rows above the 4x4 diagonal block are delegated to the kernels:
		 * presumably rows [0, j1) to the 4x4 kernel and rows [j1, j) to the
		 * 1x4 kernel, each updating y and accumulating into tmp2.
		 */
		j1 = (j/8)*8;
		if ( j1 )
			dsymv_kernel_4x4(j1, a0, a1, a2, a3, x, y, tmp1, tmp2);
		if ( j1 < j )
			dsymv_kernel_1x4(j1, j,  a0, a1, a2, a3, x, y, tmp1, tmp2);

		/* Diagonal block: column j+k contributes rows j .. j+k. */
		for (k = 0; k < 4; k++)
		{
			BLASLONG jc = j + k;
			FLOAT *col = &a[jc*lda];

			temp1 = tmp1[k];
			temp2 = tmp2[k];
			for (i = j; i < jc; i++)
			{
				y[i]  += temp1 * col[i];
				temp2 += col[i] * x[i];
			}
			y[jc] += temp1 * col[jc] + alpha * temp2;
		}
	}

	/* Remaining (mrange % 4) columns, one at a time; j continues from above. */
	for ( ; j < m; j++)
	{
		temp1 = alpha * x[j];
		temp2 = 0.0;
		a0    = &a[j*lda];
		j1    = (j/8)*8;

		/* Presumably handles rows [0, j1) and updates temp2 through its pointer. */
		if ( j1 )
			dsymv_kernel_8x1(j1, a0, x, y, &temp1, &temp2);

		for (i = j1; i < j; i++)
		{
			FLOAT at0 = a0[i];

			y[i]  += temp1 * at0;
			temp2 += at0 * x[i];
		}

		y[j] += temp1 * a0[j] + alpha * temp2;
	}

	return(0);
}
Example #2
0
/*
 * Symmetric matrix-vector update driven by the lower triangle of `a`
 * (the routine identified itself as "Symv_L" in its former debug trace).
 *
 * For every column j in [0, offset) the code visible here reads
 * a[j*lda + i] only for i >= j and applies
 *
 *     y[j] += alpha * x[j] * a[j*lda + j]
 *     y[i] += alpha * x[j] * a[j*lda + i]                 for j < i < m
 *     y[j] += alpha * sum_{j<i<m} a[j*lda + i] * x[i]
 *
 * Parameters:
 *   m       number of rows visited in each column
 *   offset  number of leading columns processed
 *   alpha   scalar applied to every contribution
 *   a       matrix storage, element (i, j) at a[j*lda + i]
 *   lda     distance between consecutive columns of a
 *   x,inc_x input vector and its stride
 *   y,inc_y vector updated in place and its stride
 *   buffer  not used by this routine
 *
 * Returns 0 in all cases.
 *
 * dsymv_kernel_4x4 is defined outside this block; the comment at its call
 * site describes what it presumably does -- confirm against its definition.
 */
int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	BLASLONG j;
	BLASLONG c;
	FLOAT temp1;
	FLOAT temp2;
	FLOAT tmp1[4];
	FLOAT tmp2[4];
	FLOAT *ap[4];

	/* Generic path for strided vectors. */
	if ( (inc_x != 1) || (inc_y != 1) )
	{
		BLASLONG jx = 0;
		BLASLONG jy = 0;

		for (j = 0; j < offset; j++)
		{
			BLASLONG ix = jx;
			BLASLONG iy = jy;

			temp1 = alpha * x[jx];
			temp2 = 0.0;
			y[jy] += temp1 * a[j*lda+j];
			for (i = j+1; i < m; i++)
			{
				ix += inc_x;
				iy += inc_y;
				y[iy] += temp1 * a[j*lda+i];
				temp2 += a[j*lda+i] * x[ix];
			}
			y[jy] += alpha * temp2;
			jx    += inc_x;
			jy    += inc_y;
		}
		return(0);
	}

	/* Unit-stride path: columns in groups of four, then the leftovers. */
	BLASLONG offset1 = (offset/4)*4;

	for (j = 0; j < offset1; j += 4)
	{
		BLASLONG rest;

		for (c = 0; c < 4; c++)
		{
			tmp1[c] = alpha * x[j+c];
			tmp2[c] = 0.0;
		}
		ap[0] = &a[j*lda];
		ap[1] = ap[0] + lda;
		ap[2] = ap[1] + lda;
		ap[3] = ap[2] + lda;

		/* Diagonal entries of the four columns. */
		for (c = 0; c < 4; c++)
			y[j+c] += tmp1[c] * ap[c][j+c];

		/* Strictly-lower part of the 4x4 diagonal block, column by column. */
		for (c = 0; c < 3; c++)
		{
			for (i = j+c+1; i < j+4; i++)
			{
				y[i]    += tmp1[c] * ap[c][i];
				tmp2[c] += ap[c][i] * x[i];
			}
		}

		/*
		 * Rows below the diagonal block. With enough rows left, the kernel is
		 * called for [j+4, m2) -- presumably performing the same four-column
		 * update there -- and the scalar loop finishes rows [m2, m).
		 * Otherwise the scalar loop covers [j+4, m) entirely.
		 */
		rest = j + 4;
		if ( m - (j+1) >= 12 )
		{
			BLASLONG m2 = (m/4)*4;

			if ( m2 > j+4 )
				dsymv_kernel_4x4(j+4, m2, ap, x, y, tmp1, tmp2);
			rest = m2;
		}

		for (i = rest; i < m; i++)
		{
			for (c = 0; c < 4; c++)
			{
				y[i]    += tmp1[c] * ap[c][i];
				tmp2[c] += ap[c][i] * x[i];
			}
		}

		for (c = 0; c < 4; c++)
			y[j+c] += alpha * tmp2[c];
	}

	/* Remaining (offset % 4) columns, one at a time. */
	for (j = offset1; j < offset; j++)
	{
		temp1 = alpha * x[j];
		temp2 = 0.0;
		y[j] += temp1 * a[j*lda+j];

		/* One pass over rows (j, m): the former three-way split visited
		 * exactly this range in the same order. */
		for (i = j+1; i < m; i++)
		{
			y[i]  += temp1 * a[j*lda+i];
			temp2 += a[j*lda+i] * x[i];
		}

		y[j] += alpha * temp2;
	}
	return(0);
}