/*
 * Rank-1 update kernel, blocked four rows by four columns.
 *
 * For each of the first (N/4)*4 columns j and every row i < M it performs
 *    A[i + j*lda] += X[i] * Y[j*incY]
 * where X is read with unit stride, Y with stride incY, and consecutive
 * columns of A are lda elements apart.
 *
 * Work that does not fit the blocking is delegated:
 *   - if M contains no full group of four rows, the entire problem is passed
 *     to ger_Mle8();
 *   - the trailing N%4 columns are passed to ger_Nle4(), starting at the
 *     current positions in Y and A.
 * Both helpers are defined outside this routine.
 *
 * alpha and incX are accepted but never referenced here.
 * NOTE(review): the routine name suggests callers guarantee alpha == 1 and
 * incX == 1 -- confirm against the call sites.
 */
void Mjoin(PATL,ger1_a1_x1_yX)
   (const int M, const int N, const SCALAR alpha, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
   const int rowsBlocked = (M >> 2) << 2;   /* rows covered by 4-row groups  */
   const int colsBlocked = (N >> 2) << 2;   /* cols covered by 4-col panels  */
   const int rowsLeft = M - rowsBlocked;    /* 0..3 rows after the groups    */
   const int yStride4 = incY << 2;          /* step over four Y entries      */
   /* After a panel the column pointers sit rowsBlocked past their start;
    * this moves them to the top of the column four columns further on. */
   const int nextPanel = (lda << 2) - rowsBlocked;
   const TYPE *yp0 = Y, *yp1 = yp0 + incY, *yp2 = yp1 + incY,
              *yp3 = yp2 + incY;
   const TYPE *xp;
   TYPE *c0 = A, *c1 = c0 + lda, *c2 = c1 + lda, *c3 = c2 + lda;
   TYPE xa, xb, xc, xd, ya, yb, yc, yd;
   int rowCnt, colCnt;

   /* No full group of four rows: let the small-M helper do everything. */
   if (!rowsBlocked)
   {
      ger_Mle8(M, N, X, Y, incY, A, lda);
      return;
   }

   for (colCnt = colsBlocked; colCnt; colCnt -= 4)
   {
      /* The four Y values of this panel stay fixed while X is swept. */
      ya = *yp0;  yp0 += yStride4;
      yb = *yp1;  yp1 += yStride4;
      yc = *yp2;  yp2 += yStride4;
      yd = *yp3;  yp3 += yStride4;

      xp = X;
      for (rowCnt = rowsBlocked; rowCnt; rowCnt -= 4)
      {
         /* Load all four X values before touching A. */
         xa = xp[0];
         xb = xp[1];
         xc = xp[2];
         xd = xp[3];
         xp += 4;

         c0[0] += ya * xa;  c1[0] += yb * xa;
         c2[0] += yc * xa;  c3[0] += yd * xa;

         c0[1] += ya * xb;  c1[1] += yb * xb;
         c2[1] += yc * xb;  c3[1] += yd * xb;

         c0[2] += ya * xc;  c1[2] += yb * xc;
         c2[2] += yc * xc;  c3[2] += yd * xc;

         c0[3] += ya * xd;  c1[3] += yb * xd;
         c2[3] += yc * xd;  c3[3] += yd * xd;

         c0 += 4;
         c1 += 4;
         c2 += 4;
         c3 += 4;
      }

      /* One to three leftover rows; the column pointers are not advanced
       * here, so nextPanel below stays correct. */
      if (rowsLeft > 0)
      {
         xa = xp[0];
         if (rowsLeft > 1) xb = xp[1];
         if (rowsLeft > 2) xc = xp[2];

         c0[0] += ya * xa;  c1[0] += yb * xa;
         c2[0] += yc * xa;  c3[0] += yd * xa;
         if (rowsLeft > 1)
         {
            c0[1] += ya * xb;  c1[1] += yb * xb;
            c2[1] += yc * xb;  c3[1] += yd * xb;
         }
         if (rowsLeft > 2)
         {
            c0[2] += ya * xc;  c1[2] += yb * xc;
            c2[2] += yc * xc;  c3[2] += yd * xc;
         }
      }

      c0 += nextPanel;
      c1 += nextPanel;
      c2 += nextPanel;
      c3 += nextPanel;
   }

   /* Columns beyond the last full panel of four. */
   if (N - colsBlocked)
      ger_Nle4(M, N - colsBlocked, X, yp0, incY, c0, lda);
}
/*
 * Rank-1 update kernel, four columns at a time with the row loop unrolled
 * by eight and software-pipelined.
 *
 * For each of the first (N/4)*4 columns j and every row i < M it performs
 *    A[i + j*lda] += X[i] * Y[j*incY]
 * where X is read with unit stride, Y with stride incY, and consecutive
 * columns of A are lda elements apart.
 *
 * Pipelining: the products for row 0 are formed before the row loop; each
 * step then loads the next X value, adds the pending products (m0..m3) to A
 * and forms the products for the following row.  The products for the last
 * row are therefore still pending after the loops and are added afterwards.
 * That is why the blocking works on M-1 rows: M8 is the largest multiple of
 * eight not exceeding M-1 and mr = M-1-M8 is the remainder.
 *
 * Work that does not fit the blocking is delegated:
 *   - if M8 is zero (M <= 8), the entire problem is passed to ger_Mle8();
 *   - the trailing N%4 columns are passed to ger_Nle4(), starting at the
 *     current positions in Y and A.
 * Both helpers are defined outside this routine.
 *
 * alpha and incX are accepted but never referenced here.
 * NOTE(review): the routine name suggests callers guarantee alpha == 1 and
 * incX == 1 -- confirm against the call sites.
 */
void Mjoin(PATL,ger1_a1_x1_yX)
   (const int M, const int N, const SCALAR alpha, const TYPE *X,
    const int incX, const TYPE *Y, const int incY, TYPE *A, const int lda)
{
   int i, j;
   const int incy = incY<<2;
   const TYPE *Y1= Y + incY, *Y2 = Y1 + incY, *Y3 = Y2 + incY;
   const TYPE *x;
   TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda;
   /*
    * Clamp M8 to 0 when M < 1.  Without the clamp, M == 0 gives
    * ((-1)>>3)<<3, a nonzero (typically -8) value obtained by shifting a
    * negative number; the blocked path would then be taken and its row
    * loop, counting down from a negative start, would never reach zero.
    */
   const int M8 = (M > 0) ? (((M-1)>>3)<<3) : 0, mr = M-M8-1;
   /* incAn: from row M-1 of one column to row 0 four columns further on. */
   const int N4 = (N>>2)<<2, incAn = (lda<<2) - M + 1;
   register TYPE m0, m1, m2, m3, x0, y0, y1, y2, y3;

   if (M8)
   {
      for (j=N4; j; j -= 4)
      {
         y0 = *Y;
         y1 = *Y1;
         y2 = *Y2;
         y3 = *Y3;
         /* Prime the pipeline with the products for row 0. */
         x0 = *X;
         x = X + 1;
         m0 = y0 * x0; Y  += incy;
         m1 = y1 * x0; Y1 += incy;
         m2 = y2 * x0; Y2 += incy;
         m3 = y3 * x0; Y3 += incy;
         for (i=M8; i; i -= 8)
         {
            x0 = *x;
            *A0 += m0; m0 = y0 * x0;
            *A1 += m1; m1 = y1 * x0;
            *A2 += m2; m2 = y2 * x0;
            *A3 += m3; m3 = y3 * x0;
            x0 = x[1];
            A0[1] += m0; m0 = y0 * x0;
            A1[1] += m1; m1 = y1 * x0;
            A2[1] += m2; m2 = y2 * x0;
            A3[1] += m3; m3 = y3 * x0;
            x0 = x[2];
            A0[2] += m0; m0 = y0 * x0;
            A1[2] += m1; m1 = y1 * x0;
            A2[2] += m2; m2 = y2 * x0;
            A3[2] += m3; m3 = y3 * x0;
            x0 = x[3];
            A0[3] += m0; m0 = y0 * x0;
            A1[3] += m1; m1 = y1 * x0;
            A2[3] += m2; m2 = y2 * x0;
            A3[3] += m3; m3 = y3 * x0;
            x0 = x[4];
            A0[4] += m0; m0 = y0 * x0;
            A1[4] += m1; m1 = y1 * x0;
            A2[4] += m2; m2 = y2 * x0;
            A3[4] += m3; m3 = y3 * x0;
            x0 = x[5];
            A0[5] += m0; m0 = y0 * x0;
            A1[5] += m1; m1 = y1 * x0;
            A2[5] += m2; m2 = y2 * x0;
            A3[5] += m3; m3 = y3 * x0;
            x0 = x[6];
            A0[6] += m0; m0 = y0 * x0;
            A1[6] += m1; m1 = y1 * x0;
            A2[6] += m2; m2 = y2 * x0;
            A3[6] += m3; m3 = y3 * x0;
            x0 = x[7]; x += 8;
            A0[7] += m0; m0 = y0 * x0; A0 += 8;
            A1[7] += m1; m1 = y1 * x0; A1 += 8;
            A2[7] += m2; m2 = y2 * x0; A2 += 8;
            A3[7] += m3; m3 = y3 * x0; A3 += 8;
         }
         /* Up to seven further rows, one per step; a no-op when mr == 0. */
         for (i=mr; i; i--)
         {
            x0 = *x++;
            *A0++ += m0; m0 = y0 * x0;
            *A1++ += m1; m1 = y1 * x0;
            *A2++ += m2; m2 = y2 * x0;
            *A3++ += m3; m3 = y3 * x0;
         }
         /* Drain the pipeline: the last row's products are still pending. */
         *A0 += m0; A0 += incAn;
         *A1 += m1; A1 += incAn;
         *A2 += m2; A2 += incAn;
         *A3 += m3; A3 += incAn;
      }
      /* Columns beyond the last full panel of four. */
      if (N-N4) ger_Nle4(M, N-N4, X, Y, incY, A0, lda);
   }
   else
      ger_Mle8(M, N, X, Y, incY, A, lda);
}