/*
 * Rank-2 update of a column-packed triangular matrix (real data).
 *
 * Every stored column j receives
 *     alpha_r * x[j] * y  +  alpha_r * y[j] * x
 * restricted to the rows that are stored for that column.
 *
 *   m        number of columns to process
 *   alpha_r  scalar multiplier
 *   x, incx  first vector and its stride
 *   y, incy  second vector and its stride
 *   a        packed matrix, updated in place; the columns follow each other
 *            without gaps (the pointer advances by the column length)
 *   buffer   scratch space: a strided x is compacted to its start, a strided
 *            y to the byte offset BUFFER_SIZE / 2 inside it
 *
 * Without LOWER column j holds rows 0..j, with LOWER it holds rows j..m-1.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha_r,
          FLOAT *x, BLASLONG incx,
          FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){

  FLOAT *xv = x;
  FLOAT *yv = y;
  BLASLONG col;

  /* Work on unit-stride copies whenever the caller's vectors are strided. */
  if (incx != 1) {
    xv = buffer;
    COPY_K(m, x, incx, xv, 1);
  }

  if (incy != 1) {
    yv = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2));
    COPY_K(m, y, incy, yv, 1);
  }

  for (col = 0; col < m; col++){
#ifndef LOWER
    /* stored rows of this column: 0 .. col */
    BLASLONG len = col + 1;
    FLOAT *xs = xv;
    FLOAT *ys = yv;
#else
    /* stored rows of this column: col .. m-1 */
    BLASLONG len = m - col;
    FLOAT *xs = xv + col;
    FLOAT *ys = yv + col;
#endif

    AXPYU_K(len, 0, 0, alpha_r * xv[col], ys, 1, a, 1, NULL, 0);
    AXPYU_K(len, 0, 0, alpha_r * yv[col], xs, 1, a, 1, NULL, 0);

    /* step to the first stored element of the next column */
    a += len;
  }

  return 0;
}
/*
 * Rank-1 update of a column-packed triangular matrix (real data).
 *
 * Every stored column j receives alpha_r * x[j] * x, restricted to the rows
 * stored for that column.  Columns whose x[j] is exactly ZERO are skipped.
 *
 *   m        number of columns to process
 *   alpha_r  scalar multiplier
 *   x, incx  vector and its stride
 *   a        packed matrix, updated in place
 *   buffer   scratch space receiving a unit-stride copy of x when incx != 1
 *
 * Without LOWER column j holds rows 0..j, with LOWER it holds rows j..m-1.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){

  FLOAT *xv = x;
  BLASLONG col;

  if (incx != 1) {
    xv = buffer;
    COPY_K(m, x, incx, xv, 1);
  }

  for (col = 0; col < m; col++){
#ifndef LOWER
    BLASLONG len = col + 1;
    FLOAT *src = xv;
#else
    BLASLONG len = m - col;
    FLOAT *src = xv + col;
#endif

    if (xv[col] != ZERO) {
      AXPYU_K(len, 0, 0, alpha_r * xv[col], src, 1, a, 1, NULL, 0);
    }

    /* the pointer moves on even when the column was skipped */
    a += len;
  }

  return 0;
}
/*
 * Rank-1 update of a column-packed triangular matrix (complex data,
 * interleaved real/imaginary parts).
 *
 * Every stored column j receives (alpha * x[j]) * x, restricted to the rows
 * stored for that column, where alpha = alpha_r + i * alpha_i and the product
 * is the plain (unconjugated) complex product.
 *
 *   m                 number of columns to process
 *   alpha_r, alpha_i  real and imaginary part of the scalar
 *   x, incx           vector and its stride
 *   a                 packed matrix, updated in place
 *   buffer            scratch space receiving a unit-stride copy of x when
 *                     incx != 1
 *
 * Without LOWER column j holds rows 0..j, with LOWER it holds rows j..m-1.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){

  BLASLONG i;
  FLOAT *X = x;

  if (incx != 1) {
    COPY_K(m, x, incx, buffer, 1);
    X = buffer;
  }

  for (i = 0; i < m; i++){
    FLOAT xr = X[i * 2 + 0];
    FLOAT xi = X[i * 2 + 1];

    /*
     * A column may only be skipped when x[i] is the complex zero, i.e. when
     * BOTH parts are zero.  Testing with && would also skip purely real and
     * purely imaginary entries and silently drop their contribution.
     */
    if ((xr != ZERO) || (xi != ZERO)) {
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0,
              alpha_r * xr - alpha_i * xi,
              alpha_i * xr + alpha_r * xi,
              X, 1, a, 1, NULL, 0);
#else
      AXPYU_K(m - i, 0, 0,
              alpha_r * xr - alpha_i * xi,
              alpha_i * xr + alpha_r * xi,
              X + i * 2, 1, a, 1, NULL, 0);
#endif
    }

    /* advance to the next packed column (two FLOATs per element) */
#ifndef LOWER
    a += (i + 1) * 2;
#else
    a += (m - i) * 2;
#endif
  }

  return 0;
}
/*
 * In-place triangular solve on a vector (real data), processed from the last
 * index towards the first in blocks of at most DTB_ENTRIES entries.
 *
 *   m        order of the matrix / length of b
 *   a, lda   column-major matrix and its leading dimension
 *   b, incb  right-hand side on entry, solution on exit, and its stride
 *   buffer   scratch space; when incb != 1 it first holds a unit-stride copy
 *            of b, and the GEMV workspace starts at the next 4096-byte
 *            boundary after those m elements
 *
 * Without TRANSA the entries above the diagonal of each column are used
 * (AXPY inside a block, GEMV_N for everything before the block).  With
 * TRANSA the entries below the diagonal of each column are used through dot
 * products (GEMV_T for everything after the block).  Unless UNIT is defined
 * each entry is divided by the diagonal element.
 *
 * dm1 is defined outside this block.  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG top, width, j, row;
  FLOAT *work = (FLOAT *)buffer;
  FLOAT *vec  = b;
  FLOAT *diag, *cur;

  if (incb != 1) {
    vec  = buffer;
    work = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  /* 'top' is one past the last index of the current block */
  for (top = m; top > 0; top -= DTB_ENTRIES){

    width = MIN(top, DTB_ENTRIES);

#ifdef TRANSA
    /* fold the already solved tail into this block */
    if (m - top > 0){
      GEMV_T(m - top, width, 0, dm1,
             a + top + (top - width) * lda, lda,
             vec + top, 1,
             vec + top - width, 1, work);
    }
#endif

    for (j = 0; j < width; j++) {
      row  = top - j - 1;
      diag = a + row + row * lda;
      cur  = vec + row;

#ifdef TRANSA
      if (j > 0) cur[0] -= DOTU_K(j, diag + 1, 1, cur + 1, 1);
#endif

#ifndef UNIT
      cur[0] /= diag[0];
#endif

#ifndef TRANSA
      /* eliminate this unknown from the remaining rows of the block */
      if (j < width - 1) {
        AXPYU_K(width - j - 1, 0, 0, -cur[0],
                diag - (width - j - 1), 1,
                cur  - (width - j - 1), 1, NULL, 0);
      }
#endif
    }

#ifndef TRANSA
    /* propagate the finished block to all entries in front of it */
    if (top - width > 0){
      GEMV_N(top - width, width, 0, dm1,
             a + (top - width) * lda, lda,
             vec + top - width, 1,
             vec, 1, work);
    }
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * In-place triangular matrix-vector product (real data), processed from the
 * first index towards the last in blocks of at most DTB_ENTRIES entries.
 *
 *   m        order of the matrix / length of b
 *   a, lda   column-major matrix and its leading dimension
 *   b, incb  input vector, overwritten with the product, and its stride
 *   buffer   scratch space; when incb != 1 it first holds a unit-stride copy
 *            of b, and the GEMV workspace starts at the next 4096-byte
 *            boundary after those m elements
 *
 * Without TRANSA the entries above the diagonal of each column are applied
 * (GEMV_N for the rows in front of the block, AXPY inside the block).  With
 * TRANSA the entries below the diagonal of each column are applied through
 * dot products (GEMV_T for the rows behind the block).  Unless UNIT is
 * defined each entry is scaled by the diagonal element.
 *
 * dp1 is defined outside this block.  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){

  BLASLONG start, width, j;
  FLOAT *work = (FLOAT *)buffer;
  FLOAT *vec  = b;
  FLOAT *col, *blk;

  if (incb != 1) {
    vec  = buffer;
    work = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  for (start = 0; start < m; start += DTB_ENTRIES){

    width = MIN(m - start, DTB_ENTRIES);
    blk   = vec + start;

#ifndef TRANSA
    /* contribution of this block's columns to the rows in front of it */
    if (start > 0){
      GEMV_N(start, width, 0, dp1,
             a + start * lda, lda,
             vec + start, 1,
             vec, 1, work);
    }
#endif

    for (j = 0; j < width; j++) {
      /* 'col' addresses row 'start' of column 'start + j' */
      col = a + start + (j + start) * lda;

#ifndef TRANSA
      if (j > 0) AXPYU_K(j, 0, 0, blk[j], col, 1, blk, 1, NULL, 0);
#endif

#ifndef UNIT
      blk[j] *= col[j];
#endif

#ifdef TRANSA
      if (j < width - 1) blk[j] += DOTU_K(width - j - 1, col + j + 1, 1, blk + j + 1, 1);
#endif
    }

#ifdef TRANSA
    /* contribution of the rows behind the block */
    if (m - start > width){
      GEMV_T(m - start - width, width, 0, dp1,
             a + start + width + start * lda, lda,
             vec + start + width, 1,
             vec + start, 1, work);
    }
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * In-place band triangular solve on a vector (real data), walking the
 * columns of the band array from the last to the first.
 *
 *   n        order of the matrix / length of b
 *   k        number of off-diagonals stored per column
 *   a, lda   band array and the distance between consecutive columns
 *   b, incb  right-hand side on entry, solution on exit, and its stride
 *   buffer   scratch space holding a unit-stride copy of b when incb != 1
 *
 * Without TRANSA the diagonal is read at a[k] and the up to k entries stored
 * in front of it are eliminated with an AXPY.  With TRANSA the diagonal is
 * read at a[0] and the up to k entries stored behind it enter through a dot
 * product.  Unless UNIT is defined each entry is divided by the diagonal.
 *
 * No separate workspace is needed here, so none is carved out of buffer.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  BLASLONG length;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    COPY_K(n, b, incb, buffer, 1);
  }

  /* start at the last column of the band array */
  a += (n - 1) * lda;

  for (i = n - 1; i >= 0; i--) {

#ifdef TRANSA
    /* entries behind the diagonal, clipped to the band width */
    length = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
      B[i] -= DOTU_K(length, a + 1, 1, B + i + 1, 1);
    }
#endif

#ifndef UNIT
#ifdef TRANSA
    B[i] /= a[0];
#else
    B[i] /= a[k];
#endif
#endif

#ifndef TRANSA
    /* entries in front of the diagonal, clipped to the band width */
    length = i;
    if (length > k) length = k;

    if (length > 0) {
      AXPYU_K(length, 0, 0, - B[i],
              a + k - length, 1, B + i - length, 1, NULL, 0);
    }
#endif

    a -= lda;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * Rank-1 update of a column-packed triangular matrix with complex data
 * (interleaved real/imaginary parts) and a real scalar.
 *
 * For every stored column j one AXPY is issued on the stored rows:
 *   - without HEMVREV: AXPYU_K with coefficient
 *         ( alpha * Re x[j], -alpha * Im x[j] )
 *   - with HEMVREV:    AXPYC_K with coefficient
 *         ( alpha * Re x[j],  alpha * Im x[j] )
 * Afterwards the imaginary part of the column's diagonal element is set to
 * ZERO.
 *
 *   m        number of columns to process
 *   alpha    real scalar
 *   x, incx  vector and its stride
 *   a        packed matrix, updated in place
 *   buffer   scratch space receiving a unit-stride copy of x when incx != 1
 *
 * Without LOWER column j holds rows 0..j (diagonal last), with LOWER it holds
 * rows j..m-1 (diagonal first).  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){

  FLOAT *xv = x;
  BLASLONG col;

  if (incx != 1) {
    xv = buffer;
    COPY_K(m, x, incx, xv, 1);
  }

  for (col = 0; col < m; col++){
#ifndef LOWER
    BLASLONG len  = col + 1;       /* rows 0 .. col            */
    BLASLONG diag = col;           /* diagonal is the last one */
    FLOAT *src = xv;
#else
    BLASLONG len  = m - col;       /* rows col .. m-1           */
    BLASLONG diag = 0;             /* diagonal is the first one */
    FLOAT *src = xv + col * 2;
#endif

#ifndef HEMVREV
    AXPYU_K(len, 0, 0,
            alpha * xv[col * 2 + 0],
            -alpha * xv[col * 2 + 1],
            src, 1, a, 1, NULL, 0);
#else
    AXPYC_K(len, 0, 0,
            alpha * xv[col * 2 + 0],
            alpha * xv[col * 2 + 1],
            src, 1, a, 1, NULL, 0);
#endif

    /* force the diagonal to stay real */
    a[diag * 2 + 1] = ZERO;

    a += len * 2;
  }

  return 0;
}
/*
 * In-place band triangular matrix-vector product (real data), walking the
 * columns of the band array from the first to the last.
 *
 *   n        order of the matrix / length of b
 *   k        number of off-diagonals stored per column
 *   a, lda   band array and the distance between consecutive columns
 *   b, incb  input vector, overwritten with the product, and its stride
 *   buffer   scratch space holding a unit-stride copy of b when incb != 1
 *
 * Without TRANSA the up to k entries stored in front of the diagonal (which
 * sits at a[k]) are applied with an AXPY before the diagonal scaling.  With
 * TRANSA the diagonal sits at a[0] and the up to k entries stored behind it
 * are applied with a dot product after the diagonal scaling.  The diagonal
 * scaling is omitted when UNIT is defined.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG col;
  BLASLONG len;
  FLOAT *vec = b;

  if (incb != 1) {
    vec = buffer;
    COPY_K(n, b, incb, buffer, 1);
  }

  for (col = 0; col < n; col++) {

#ifndef TRANSA
    /* number of stored entries in front of the diagonal */
    len = (col > k) ? k : col;

    if (len > 0) {
      AXPYU_K(len, 0, 0, vec[col],
              a + k - len, 1, vec + col - len, 1, NULL, 0);
    }
#endif

#ifndef UNIT
#ifndef TRANSA
    vec[col] *= a[k];
#else
    vec[col] *= a[0];
#endif
#endif

#ifdef TRANSA
    /* number of stored entries behind the diagonal */
    len = n - col - 1;
    if (len > k) len = k;

    if (len > 0) {
      vec[col] += DOTU_K(len, a + 1, 1, vec + col + 1, 1);
    }
#endif

    a += lda;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * In-place packed triangular matrix-vector product (real data), walking the
 * packed array backwards from its last element.
 *
 *   m        order of the matrix / length of b
 *   a        packed triangular matrix, m * (m + 1) / 2 elements
 *   b, incb  input vector, overwritten with the product, and its stride
 *   buffer   scratch space holding a unit-stride copy of b when incb != 1
 *
 * At every step 'a' points at the diagonal element belonging to 'row'.
 * Without TRANSA the elements stored right behind the diagonal are applied
 * with an AXPY onto the rows after 'row'.  With TRANSA the elements stored
 * right in front of the diagonal are applied with a dot product over the
 * rows before 'row'.  The diagonal scaling is omitted when UNIT is defined.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG row;
  FLOAT *vec = b;

  if (incb != 1) {
    vec = buffer;
    COPY_K(m, b, incb, buffer, 1);
  }

  /* last element of the packed array */
  a += (m + 1) * m / 2 - 1;

  for (row = m - 1; row >= 0; row--) {

#ifndef TRANSA
    /* m - 1 - row entries follow the diagonal */
    if (m - 1 - row > 0) {
      AXPYU_K(m - 1 - row, 0, 0, vec[row], a + 1, 1, vec + row + 1, 1, NULL, 0);
    }
#endif

#ifndef UNIT
    vec[row] *= a[0];
#endif

#ifdef TRANSA
    /* 'row' entries precede the diagonal */
    if (row > 0) vec[row] += DOTU_K(row, a - row, 1, vec, 1);
#endif

    /* move to the previous diagonal element */
#ifndef TRANSA
    a -= (m - row + 1);
#else
    a -= (row + 1);
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * In-place triangular solve on a vector (complex data, interleaved
 * real/imaginary parts), processed from the last index towards the first in
 * blocks of at most DTB_ENTRIES entries.
 *
 *   m        order of the matrix / length of b
 *   a, lda   column-major matrix and its leading dimension (in elements)
 *   b, incb  right-hand side on entry, solution on exit, and its stride
 *   buffer   scratch space; when incb != 1 it first holds a unit-stride copy
 *            of b, and the GEMV workspace starts at the next 4096-byte
 *            boundary after those m complex elements
 *
 * TRANSA selects the variant:
 *   1  AXPYU_K / GEMV_N on the entries above the diagonal of each column
 *   2  DOTU_K  / GEMV_T on the entries below the diagonal of each column
 *   3  AXPYC_K / GEMV_R, otherwise like 1
 *   4  DOTC_K  / GEMV_C, otherwise like 2
 * Unless UNIT is defined every entry is multiplied by the reciprocal of the
 * diagonal element; for TRANSA >= 3 the sign of the reciprocal's imaginary
 * part is flipped.  The reciprocal is formed with a ratio of the smaller to
 * the larger component.
 *
 * dm1 is defined outside this block.  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG top, width, j, row;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex dot;
#endif
#ifndef UNIT
  FLOAT dr, di, inv_r, inv_i, br, bi, ratio, den;
#endif
  FLOAT *work = (FLOAT *)buffer;
  FLOAT *vec  = b;
  FLOAT *diag, *cur;

  if (incb != 1) {
    vec  = buffer;
    work = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  /* 'top' is one past the last index of the current block */
  for (top = m; top > 0; top -= DTB_ENTRIES){

    width = MIN(top, DTB_ENTRIES);

#if (TRANSA == 2) || (TRANSA == 4)
    /* fold the already solved tail into this block */
    if (m - top > 0){
#if TRANSA == 2
      GEMV_T(m - top, width, 0, dm1, ZERO,
             a + (top + (top - width) * lda) * COMPSIZE, lda,
             vec + top * COMPSIZE, 1,
             vec + (top - width) * COMPSIZE, 1, work);
#else
      GEMV_C(m - top, width, 0, dm1, ZERO,
             a + (top + (top - width) * lda) * COMPSIZE, lda,
             vec + top * COMPSIZE, 1,
             vec + (top - width) * COMPSIZE, 1, work);
#endif
    }
#endif

    for (j = 0; j < width; j++) {
      row  = top - j - 1;
      diag = a + (row + row * lda) * COMPSIZE;
      cur  = vec + row * COMPSIZE;

#if (TRANSA == 2) || (TRANSA == 4)
      if (j > 0) {
#if TRANSA == 2
        dot = DOTU_K(j, diag + 2, 1, cur + 2, 1);
#else
        dot = DOTC_K(j, diag + 2, 1, cur + 2, 1);
#endif
        cur[0] -= CREAL(dot);
        cur[1] -= CIMAG(dot);
      }
#endif

#ifndef UNIT
      dr = diag[0];
      di = diag[1];

      /* reciprocal of the diagonal, dividing by the larger component */
      if (fabs(dr) >= fabs(di)){
        ratio = di / dr;
        den   = 1./(dr * ( 1 + ratio * ratio));
        inv_r = den;
#if TRANSA < 3
        inv_i = -ratio * den;
#else
        inv_i = ratio * den;
#endif
      } else {
        ratio = dr / di;
        den   = 1./(di * ( 1 + ratio * ratio));
        inv_r = ratio * den;
#if TRANSA < 3
        inv_i = -den;
#else
        inv_i = den;
#endif
      }

      br = cur[0];
      bi = cur[1];

      cur[0] = inv_r*br - inv_i*bi;
      cur[1] = inv_r*bi + inv_i*br;
#endif

#if (TRANSA == 1) || (TRANSA == 3)
      /* eliminate this unknown from the remaining rows of the block */
      if (j < width - 1) {
#if TRANSA == 1
        AXPYU_K(width - j - 1, 0, 0, - cur[0], -cur[1],
                diag - (width - j - 1) * COMPSIZE, 1,
                cur  - (width - j - 1) * COMPSIZE, 1, NULL, 0);
#else
        AXPYC_K(width - j - 1, 0, 0, - cur[0], -cur[1],
                diag - (width - j - 1) * COMPSIZE, 1,
                cur  - (width - j - 1) * COMPSIZE, 1, NULL, 0);
#endif
      }
#endif
    }

#if (TRANSA == 1) || (TRANSA == 3)
    /* propagate the finished block to all entries in front of it */
    if (top - width > 0){
#if TRANSA == 1
      GEMV_N(top - width, width, 0, dm1, ZERO,
             a + (top - width) * lda * COMPSIZE, lda,
             vec + (top - width) * COMPSIZE, 1,
             vec, 1, work);
#else
      GEMV_R(top - width, width, 0, dm1, ZERO,
             a + (top - width) * lda * COMPSIZE, lda,
             vec + (top - width) * COMPSIZE, 1,
             vec, 1, work);
#endif
    }
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * In-place triangular matrix-vector product (complex data, interleaved
 * real/imaginary parts), processed from the first index towards the last in
 * blocks of at most DTB_ENTRIES entries.
 *
 *   m        order of the matrix / length of b
 *   a, lda   column-major matrix and its leading dimension (in elements)
 *   b, incb  input vector, overwritten with the product, and its stride
 *   buffer   scratch space; when incb != 1 it first holds a unit-stride copy
 *            of b, and the GEMV workspace starts at the next 4096-byte
 *            boundary after those m complex elements
 *
 * TRANSA selects the variant:
 *   1  GEMV_N / AXPYU_K on the entries above the diagonal of each column
 *   2  DOTU_K / GEMV_T  on the entries below the diagonal of each column
 *   3  GEMV_R / AXPYC_K, otherwise like 1
 *   4  DOTC_K / GEMV_C, otherwise like 2
 * Unless UNIT is defined every entry is multiplied by the diagonal element
 * (TRANSA 1, 2) or by its complex conjugate (TRANSA 3, 4).
 *
 * dp1 is defined outside this block.  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer) {

  BLASLONG start, width, j;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT dot;
#endif
#ifndef UNIT
  FLOAT dr, di, vr, vi;
#endif
  FLOAT *work = (FLOAT *)buffer;
  FLOAT *vec  = b;
  FLOAT *col, *blk, *cur;

  if (incb != 1) {
    vec  = buffer;
    work = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  for (start = 0; start < m; start += DTB_ENTRIES) {

    width = MIN(m - start, DTB_ENTRIES);
    blk   = vec + start * 2;

#if (TRANSA == 1) || (TRANSA == 3)
    /* contribution of this block's columns to the rows in front of it */
    if (start > 0) {
#if TRANSA == 1
      GEMV_N(start, width, 0, dp1, ZERO,
             a + start * lda * 2, lda,
             vec + start * 2, 1,
             vec, 1, work);
#else
      GEMV_R(start, width, 0, dp1, ZERO,
             a + start * lda * 2, lda,
             vec + start * 2, 1,
             vec, 1, work);
#endif
    }
#endif

    for (j = 0; j < width; j++) {
      /* 'col' addresses row 'start' of column 'start + j' */
      col = a + (start + (j + start) * lda) * 2;
      cur = blk + j * 2;

#if (TRANSA == 1) || (TRANSA == 3)
      if (j > 0) {
#if TRANSA == 1
        AXPYU_K(j, 0, 0, cur[0], cur[1], col, 1, blk, 1, NULL, 0);
#else
        AXPYC_K(j, 0, 0, cur[0], cur[1], col, 1, blk, 1, NULL, 0);
#endif
      }
#endif

#ifndef UNIT
      dr = col[j * 2 + 0];
      di = col[j * 2 + 1];

      vr = cur[0];
      vi = cur[1];

#if (TRANSA == 1) || (TRANSA == 2)
      cur[0] = dr * vr - di * vi;
      cur[1] = dr * vi + di * vr;
#else
      cur[0] = dr * vr + di * vi;
      cur[1] = dr * vi - di * vr;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
      if (j < width - 1) {
#if TRANSA == 2
        dot = DOTU_K(width - j - 1, col + (j + 1) * 2, 1, blk + (j + 1) * 2, 1);
#else
        dot = DOTC_K(width - j - 1, col + (j + 1) * 2, 1, blk + (j + 1) * 2, 1);
#endif
        cur[0] += CREAL(dot);
        cur[1] += CIMAG(dot);
      }
#endif
    }

#if (TRANSA == 2) || (TRANSA == 4)
    /* contribution of the rows behind the block */
    if (m - start > width) {
#if TRANSA == 2
      GEMV_T(m - start - width, width, 0, dp1, ZERO,
             a + (start + width + start * lda) * 2, lda,
             vec + (start + width) * 2, 1,
             vec + start * 2, 1, work);
#else
      GEMV_C(m - start - width, width, 0, dp1, ZERO,
             a + (start + width + start * lda) * 2, lda,
             vec + (start + width) * 2, 1,
             vec + start * 2, 1, work);
#endif
    }
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * In-place packed triangular matrix-vector product (complex data,
 * interleaved real/imaginary parts), walking the packed array backwards from
 * its last element.
 *
 *   m        order of the matrix / length of b
 *   a        packed triangular matrix, m * (m + 1) / 2 complex elements
 *   b, incb  input vector, overwritten with the product, and its stride
 *   buffer   scratch space holding a unit-stride copy of b when incb != 1
 *
 * At step i 'a' points at the diagonal element belonging to entry m - i - 1.
 * TRANSA selects the variant:
 *   1  AXPYU_K with the i elements stored right behind the diagonal
 *   2  DOTU_K  with the m - i - 1 elements stored right in front of it
 *   3  AXPYC_K, otherwise like 1
 *   4  DOTC_K,  otherwise like 2
 * Unless UNIT is defined every entry is multiplied by the diagonal element
 * (TRANSA 1, 2) or by its complex conjugate (TRANSA 3, 4).
 *
 * No separate workspace is needed here, so none is carved out of buffer.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex temp;
#endif
#ifndef UNIT
  FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    COPY_K(m, b, incb, buffer, 1);
  }

  /* last complex element of the packed array (two FLOATs per element) */
  a += (m + 1) * m - 2;

  for (i = 0; i < m; i++) {

#if (TRANSA == 1) || (TRANSA == 3)
#if TRANSA == 1
    if (i > 0) AXPYU_K(i, 0, 0,
                       B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
                       a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#else
    if (i > 0) AXPYC_K(i, 0, 0,
                       B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
                       a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
    atemp1 = a[0];
    atemp2 = a[1];

    btemp1 = B[(m - i - 1) * 2 + 0];
    btemp2 = B[(m - i - 1) * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
    B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
    B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
    B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
    B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
    if (i < m - 1) {
#if TRANSA == 2
      temp = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#else
      temp = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#endif

      B[(m - i - 1) * 2 + 0] += CREAL(temp);
      B[(m - i - 1) * 2 + 1] += CIMAG(temp);
    }
#endif

    /* move to the previous diagonal element */
#if (TRANSA == 1) || (TRANSA == 3)
    a -= (i + 2) * 2;
#else
    a -= (m - i) * 2;
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * Matrix-vector product y += alpha * A * x for a column-packed triangular
 * storage of a complex matrix whose diagonal is treated as real (only the
 * real part of each diagonal element is read).
 *
 *   m                 order of the matrix
 *   alpha_r, alpha_i  real and imaginary part of the scalar
 *   a                 packed matrix
 *   x, incx           input vector and its stride
 *   y, incy           vector that is accumulated into, and its stride
 *   buffer            scratch space; a strided y is compacted to its start,
 *                     a strided x to the next 4096-byte boundary behind the
 *                     area reserved for y
 *
 * For column i three contributions are added:
 *   1. a dot product of the off-diagonal part of the column with x into y[i]
 *      (DOTC_K without HEMVREV, DOTU_K with HEMVREV),
 *   2. the real diagonal times x[i] into y[i],
 *   3. an AXPY of the off-diagonal part scaled by alpha * x[i] into y
 *      (AXPYU_K without HEMVREV, AXPYC_K with HEMVREV).
 * Without LOWER the off-diagonal part consists of the i elements in front of
 * the diagonal, with LOWER of the m - i - 1 elements behind it.
 *
 * With LOWER the pointer 'a' only advances by m - i - 1 elements per column,
 * so that a[i * 2] keeps addressing the diagonal of the current column.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG col;
  FLOAT *xv = x;
  FLOAT *yv = y;
  FLOAT *scratch = (FLOAT *)buffer;
  FLOAT *ybuf = scratch;
  FLOAT *xbuf = scratch;
  FLOAT diag_x[2];

  if (incy != 1) {
    yv      = ybuf;
    xbuf    = (FLOAT *)(((BLASLONG)ybuf + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    scratch = xbuf;
    COPY_K(m, y, incy, yv, 1);
  }

  if (incx != 1) {
    xv      = xbuf;
    scratch = (FLOAT *)(((BLASLONG)xbuf + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, x, incx, xv, 1);
  }

  for (col = 0; col < m; col++) {
#ifndef LOWER
    /* off-diagonal part: rows 0 .. col-1 */
    BLASLONG len = col;
    FLOAT *off = a;
    FLOAT *xs  = xv;
    FLOAT *ys  = yv;
#else
    /* off-diagonal part: rows col+1 .. m-1 */
    BLASLONG len = m - col - 1;
    FLOAT *off = a  + (col + 1) * 2;
    FLOAT *xs  = xv + (col + 1) * 2;
    FLOAT *ys  = yv + (col + 1) * 2;
#endif

    if (len > 0) {
#ifndef HEMVREV
      FLOAT _Complex result = DOTC_K(len, off, 1, xs, 1);
#else
      FLOAT _Complex result = DOTU_K(len, off, 1, xs, 1);
#endif

      yv[col * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      yv[col * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    /* diagonal contribution; only the real part of the diagonal is used */
    diag_x[0] = a[col * 2 + 0] * xv[col * 2 + 0];
    diag_x[1] = a[col * 2 + 0] * xv[col * 2 + 1];

    yv[col * 2 + 0] += alpha_r * diag_x[0] - alpha_i * diag_x[1];
    yv[col * 2 + 1] += alpha_r * diag_x[1] + alpha_i * diag_x[0];

    if (len > 0) {
#ifndef HEMVREV
      AXPYU_K(len, 0, 0,
              alpha_r * xv[col * 2 + 0] - alpha_i * xv[col * 2 + 1],
              alpha_r * xv[col * 2 + 1] + alpha_i * xv[col * 2 + 0],
              off, 1, ys, 1, NULL, 0);
#else
      AXPYC_K(len, 0, 0,
              alpha_r * xv[col * 2 + 0] - alpha_i * xv[col * 2 + 1],
              alpha_r * xv[col * 2 + 1] + alpha_i * xv[col * 2 + 0],
              off, 1, ys, 1, NULL, 0);
#endif
    }

#ifndef LOWER
    a += (col + 1) * 2;
#else
    a += (m - col - 1) * 2;
#endif
  }

  if (incy != 1) {
    COPY_K(m, yv, 1, y, incy);
  }

  return 0;
}
/*
 * Multi-threaded driver that splits a band matrix-vector product over up to
 * `nthreads` workers running sbmv_kernel and then accumulates the result
 * into y.
 *
 * NOTE(review): the `#else` / `#endif` right after the first signature close
 * a conditional whose opening directive is not part of this block.  Since
 * the first signature takes `FLOAT alpha`, the second `FLOAT *alpha`, and the
 * final AXPYU_K selects between them with `#ifndef COMPLEX`, the opening
 * directive is presumably `#ifndef COMPLEX` -- confirm against the full file.
 *
 * Steps visible here:
 *   1. The shared arguments are stored in `args`: n, k, a, x (as b), buffer
 *      (as c), lda, incx (as ldb) and incy (as ldc).
 *   2. The index range 0..n is cut into slices, one queue entry per slice:
 *      - n < 2 * k: the slice width is derived from
 *        di - sqrt(di * di - dnum) with di = n - i and
 *        dnum = n * n / nthreads, rounded up to a multiple of 8 (mask = 7),
 *        raised to at least 16 and clipped to what is left; the last
 *        available thread takes the remainder.  Without LOWER the slice
 *        boundaries are written from the end of range_m backwards, with
 *        LOWER from the front.
 *      - otherwise: the remaining length is divided evenly over the
 *        remaining threads with blas_quickdivide, at least 4 per slice.
 *      range_n[t] stores t times a padded length; presumably the offset of
 *      slice t's private result area inside buffer -- confirm in
 *      sbmv_kernel, which is not visible here.
 *   3. queue[0].sb is set to buffer and exec_blas runs the queue.
 *   4. For every slice except the first, n values read from queue[i].sb are
 *      added onto buffer; finally alpha * buffer is added to y with stride
 *      incy.
 *
 * NOTE(review): every queue[t].sb is initialised to NULL and only
 * queue[0].sb is reassigned in this block, so the reduction in step 4 relies
 * on exec_blas (not visible) filling in queue[i].sb -- confirm.
 * NOTE(review): `mode` is declared only under `#ifdef SMP` but is used
 * unconditionally when the queue entries are filled.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.n = n; args.k = k; args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; args.lda = lda; args.ldb = incx; args.ldc = incy; dnum = (double)n * (double)n / (double)nthreads; num_cpu = 0; if (n < 2 * k) { #ifndef LOWER range_m[MAX_CPU_NUMBER] = n; i = 0; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)(n - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } if (width < 16) width = 16; if (width > n - i) width = n - i; } else { width = n - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < n){ if (nthreads - num_cpu > 1) { double di = (double)(n - i); if (di * di - 
dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = n - i; } if (width < 16) width = 16; if (width > n - i) width = n - i; } else { width = n - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif } else { range_m[0] = 0; i = n; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * ((n + 15) & ~15); queue[num_cpu].mode = mode; queue[num_cpu].routine = sbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } } if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } for (i = 1; i < num_cpu; i ++) { AXPYU_K(n, 0, 0, #ifndef COMPLEX ONE, #else ONE, ZERO, #endif (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); } AXPYU_K(n, 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); return 0; }
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ blasint n = *N; blasint incx = *INCX; blasint incy = *INCY; #else void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #endif FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); #ifdef SMP int mode, nthreads; #endif #ifndef CBLAS PRINT_DEBUG_CNAME; #else PRINT_DEBUG_CNAME; #endif if (n <= 0) return; if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx * 2; if (incy < 0) y -= (n - 1) * incy * 2; #ifdef SMP nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif #ifndef CONJ AXPYU_K (n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); #else AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); #endif #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ (void *)AXPYU_K, #else (void *)AXPYC_K, #endif nthreads); } #endif FUNCTION_PROFILE_END(4, 2 * n, 2 * n); IDEBUG_END; return; }
/*
 * Multi-threaded driver that splits a packed triangular matrix-vector
 * product over up to `nthreads` workers running tpmv_kernel and copies the
 * result back into x.
 *
 * NOTE(review): the `#else` / `#endif` right after the first signature close
 * a conditional whose opening directive is not part of this block; both
 * signatures visible here are identical.
 *
 * Steps visible here:
 *   1. The shared arguments are stored in `args`: m, a, x (as b), buffer
 *      (as c) and incx (as both ldb and ldc).
 *   2. The index range 0..m is cut into slices, one queue entry per slice.
 *      The slice width is derived from di - sqrt(di * di - dnum) with
 *      di = m - i and dnum = m * m / nthreads, rounded up to a multiple of 8
 *      (mask = 7), raised to at least 16 and clipped to what is left; the
 *      last available thread takes the remainder.  Without LOWER the slice
 *      boundaries are written from the end of range_m backwards, with LOWER
 *      from the front.
 *      range_n[t] stores t * (((m + 15) & ~15) + 16); the reduction below
 *      uses it as the element offset of slice t's partial result inside
 *      buffer.
 *   3. queue[0].sb is pointed behind num_cpu padded result areas inside
 *      buffer and exec_blas runs the queue.
 *   4. Unless TRANS is defined, the partial results of slices 1..num_cpu-1
 *      are added onto the first result area: without LOWER the leading
 *      range_m[MAX_CPU_NUMBER - i] elements, with LOWER the elements from
 *      range_m[i] to m.
 *   5. The first m elements of buffer are copied to x with stride incx.
 *
 * NOTE(review): how the kernel lays out its output for the TRANS case is not
 * visible here -- confirm in tpmv_kernel that no reduction is needed.
 * NOTE(review): `mode` is declared only under `#ifdef SMP` but is used
 * unconditionally when the queue entries are filled.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER + 1]; BLASLONG range_n[MAX_CPU_NUMBER]; BLASLONG width, i, num_cpu; double dnum; int mask = 7; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.a = (void *)a; args.b = (void *)x; args.c = (void *)(buffer); args.ldb = incx; args.ldc = incx; dnum = (double)m * (double)m / (double)nthreads; num_cpu = 0; #ifndef LOWER range_m[MAX_CPU_NUMBER] = m; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { width = m - i; } range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #else range_m[0] = 0; i = 0; while (i < m){ if (nthreads - num_cpu > 1) { double di = (double)(m - i); if (di * di - dnum > 0) { width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; } else { width = m - i; } if (width < 16) width = 16; if (width > m - i) width = m - i; } else { 
width = m - i; } range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); queue[num_cpu].mode = mode; queue[num_cpu].routine = tpmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i += width; } #endif if (num_cpu) { queue[0].sa = NULL; queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } #ifndef TRANS for (i = 1; i < num_cpu; i ++) { #ifndef LOWER AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); #else AXPYU_K(m - range_m[i], 0, 0, ONE, #ifdef COMPLEX ZERO, #endif buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); #endif } #endif COPY_K(m, buffer, 1, x, incx); return 0; }
/*
 * Band matrix-vector product y += alpha * A * x for a complex matrix of
 * which only one triangle of the band is stored; all products are plain
 * (unconjugated) complex products.
 *
 *   n                 order of the matrix
 *   k                 number of off-diagonals stored per column
 *   alpha_r, alpha_i  real and imaginary part of the scalar
 *   a, lda            band array and the distance between consecutive
 *                     columns (in complex elements)
 *   x, incx           input vector and its stride
 *   y, incy           vector that is accumulated into, and its stride
 *   buffer            scratch space; a strided y is compacted to its start,
 *                     a strided x to the next 4096-byte boundary behind the
 *                     area reserved for y
 *
 * For column i, with 'len' stored off-diagonal elements:
 *   1. an AXPY of len + 1 stored elements (off-diagonals and diagonal)
 *      scaled by alpha * x[i] is added into y,
 *   2. the dot product of the len off-diagonal elements with the matching
 *      part of x, scaled by alpha, is added into y[i].
 * Without LOWER the stored part of a column starts at a + skip, where skip
 * counts down from k to 0, and the diagonal comes last.  With LOWER the
 * diagonal is stored first and the off-diagonals follow.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, BLASLONG lda,
          FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG col, len;
#ifndef LOWER
  BLASLONG skip;
#endif
  FLOAT *xv = x;
  FLOAT *yv = y;
  FLOAT *scratch = (FLOAT *)buffer;
  FLOAT *ybuf = scratch;
  FLOAT *xbuf = scratch;
  FLOAT *band, *ydst, *dot_a, *dot_x;

  if (incy != 1) {
    yv      = ybuf;
    xbuf    = (FLOAT *)(((BLASLONG)ybuf + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    scratch = xbuf;
    COPY_K(n, y, incy, yv, 1);
  }

  if (incx != 1) {
    xv      = xbuf;
    scratch = (FLOAT *)(((BLASLONG)xbuf + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, x, incx, xv, 1);
  }

#ifndef LOWER
  skip = k;
#endif

  for (col = 0; col < n; col++) {

#ifndef LOWER
    len   = k - skip;
    band  = a + skip * COMPSIZE;
    ydst  = yv + (col - len) * COMPSIZE;
    dot_a = band;
    dot_x = xv + (col - len) * COMPSIZE;
#else
    len = k;
    if (n - col - 1 < k) len = n - col - 1;
    band  = a;
    ydst  = yv + col * COMPSIZE;
    dot_a = a + COMPSIZE;
    dot_x = xv + (col + 1) * COMPSIZE;
#endif

    /* off-diagonals plus diagonal, scaled by alpha * x[col] */
    AXPYU_K(len + 1, 0, 0,
            alpha_r * xv[col * 2 + 0] - alpha_i * xv[col * 2 + 1],
            alpha_r * xv[col * 2 + 1] + alpha_i * xv[col * 2 + 0],
            band, 1, ydst, 1, NULL, 0);

    /* the mirrored off-diagonals contribute to y[col] only */
    if (len > 0) {
      FLOAT _Complex result = DOTU_K(len, dot_a, 1, dot_x, 1);

      yv[col * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      yv[col * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

#ifndef LOWER
    if (skip > 0) skip --;
#endif

    a += lda * 2;
  }

  if (incy != 1) {
    COPY_K(n, yv, 1, y, incy);
  }

  return 0;
}
/*
 * Multi-threaded driver that splits a general band matrix-vector product
 * over up to `nthreads` workers running gbmv_kernel and then accumulates the
 * result into y.
 *
 * NOTE(review): the `#else` / `#endif` right after the first signature close
 * a conditional whose opening directive is not part of this block.  Since
 * the first signature takes `FLOAT alpha`, the second `FLOAT *alpha`, and the
 * final AXPYU_K selects between them with `#ifndef COMPLEX`, the opening
 * directive is presumably `#ifndef COMPLEX` -- confirm against the full file.
 *
 * Steps visible here:
 *   1. The shared arguments are stored in `args`: m, n, a, x (as b), buffer
 *      (as c), lda, incx (as ldb), ku (as ldc) and kl (as ldd).
 *   2. The range 0..n is divided evenly over the remaining threads with
 *      blas_quickdivide (at least 4 per slice, clipped to what is left); the
 *      boundaries go to range_n and one queue entry is created per slice.
 *      range_m[t] stores t times the length padded to a multiple of 16
 *      (m without TRANSA, n with TRANSA); the reduction below uses it as the
 *      element offset of slice t's partial result inside buffer.
 *   3. queue[0].sb is pointed behind num_cpu padded result areas inside
 *      buffer and exec_blas runs the queue.
 *   4. The partial results of slices 1..num_cpu-1 (m elements without
 *      TRANSA, n with TRANSA) are added onto the first result area, and
 *      finally alpha * buffer is added to y with stride incy.
 *
 * NOTE(review): `mode` is declared only under `#ifdef SMP` but is used
 * unconditionally when the queue entries are filled.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #else int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ #endif blas_arg_t args; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_m[MAX_CPU_NUMBER]; BLASLONG range_n[MAX_CPU_NUMBER + 1]; BLASLONG width, i, num_cpu; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif args.m = m; args.n = n; args.a = (void *)a; args.b = (void *)x; args.c = (void *)buffer; args.lda = lda; args.ldb = incx; args.ldc = ku; args.ldd = kl; num_cpu = 0; range_n[0] = 0; i = n; while (i > 0){ width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); if (width < 4) width = 4; if (i < width) width = i; range_n[num_cpu + 1] = range_n[num_cpu] + width; #ifndef TRANSA range_m[num_cpu] = num_cpu * ((m + 15) & ~15); #else range_m[num_cpu] = num_cpu * ((n + 15) & ~15); #endif queue[num_cpu].mode = mode; queue[num_cpu].routine = gbmv_kernel; queue[num_cpu].args = &args; queue[num_cpu].range_m = &range_m[num_cpu]; queue[num_cpu].range_n = &range_n[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; num_cpu ++; i -= width; } if (num_cpu) { queue[0].sa = NULL; #ifndef TRANSA queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; #else queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; #endif queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); } for (i = 1; i < num_cpu; i ++) { 
AXPYU_K( #ifndef TRANSA m, #else n, #endif 0, 0, #ifndef COMPLEX ONE, #else ONE, ZERO, #endif buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0); } AXPYU_K( #ifndef TRANSA m, #else n, #endif 0, 0, #ifndef COMPLEX alpha, #else alpha[0], alpha[1], #endif buffer, 1, y, incy, NULL, 0); return 0; }
/*
 * Rank-2 update of one triangle of a column-major complex matrix
 * (interleaved real/imaginary parts) from two vectors x and y.
 *
 * For every column two AXPY calls are issued on the rows of the selected
 * triangle; afterwards the imaginary part of the column's diagonal element
 * is set to ZERO.
 *   - without HEMVREV (AXPYU_K):
 *       coefficient 1 = ( ar*xr - ai*xi , -ai*xr - ar*xi )  applied to y
 *       coefficient 2 = ( ar*yr + ai*yi ,  ai*yr - ar*yi )  applied to x
 *   - with HEMVREV (AXPYC_K):
 *       coefficient 1 = ( ar*xr - ai*xi ,  ai*xr + ar*xi )  applied to y
 *       coefficient 2 = ( ar*yr + ai*yi , -ai*yr + ar*yi )  applied to x
 *   where (ar, ai) = alpha and (xr, xi), (yr, yi) are x[col], y[col].
 *
 *   m                 order of the matrix
 *   alpha_r, alpha_i  real and imaginary part of the scalar
 *   x, incx           first vector and its stride
 *   y, incy           second vector and its stride
 *   a, lda            matrix, updated in place, and its leading dimension
 *                     (in complex elements)
 *   buffer            scratch space: a strided x is compacted to its start,
 *                     a strided y to the byte offset BUFFER_SIZE / 2
 *
 * Without LOWER 'a' points at the top of the current column and rows 0..col
 * are updated; with LOWER 'a' points at the diagonal element of the current
 * column and rows col..m-1 are updated.  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *x, BLASLONG incx,
          FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){

  FLOAT *xv = x;
  FLOAT *yv = y;
  BLASLONG col;

  /* from here on lda counts FLOATs instead of complex elements */
  lda *= 2;

  if (incx != 1) {
    xv = buffer;
    COPY_K(m, x, incx, xv, 1);
  }

  if (incy != 1) {
    yv = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2));
    COPY_K(m, y, incy, yv, 1);
  }

  for (col = 0; col < m; col++){
#ifndef LOWER
    BLASLONG len  = col + 1;       /* rows 0 .. col                    */
    BLASLONG diag = col;           /* diagonal is the last updated row */
    BLASLONG step = lda;           /* next column, same row            */
    FLOAT *xs = xv;
    FLOAT *ys = yv;
#else
    BLASLONG len  = m - col;       /* rows col .. m-1                   */
    BLASLONG diag = 0;             /* diagonal is the first updated row */
    BLASLONG step = 2 + lda;       /* next column, one row further down */
    FLOAT *xs = xv + col * 2;
    FLOAT *ys = yv + col * 2;
#endif

#ifndef HEMVREV
    AXPYU_K(len, 0, 0,
            alpha_r * xv[col * 2 + 0] - alpha_i * xv[col * 2 + 1],
            - alpha_i * xv[col * 2 + 0] - alpha_r * xv[col * 2 + 1],
            ys, 1, a, 1, NULL, 0);
    AXPYU_K(len, 0, 0,
            alpha_r * yv[col * 2 + 0] + alpha_i * yv[col * 2 + 1],
            alpha_i * yv[col * 2 + 0] - alpha_r * yv[col * 2 + 1],
            xs, 1, a, 1, NULL, 0);
#else
    AXPYC_K(len, 0, 0,
            alpha_r * xv[col * 2 + 0] - alpha_i * xv[col * 2 + 1],
            alpha_i * xv[col * 2 + 0] + alpha_r * xv[col * 2 + 1],
            ys, 1, a, 1, NULL, 0);
    AXPYC_K(len, 0, 0,
            alpha_r * yv[col * 2 + 0] + alpha_i * yv[col * 2 + 1],
            - alpha_i * yv[col * 2 + 0] + alpha_r * yv[col * 2 + 1],
            xs, 1, a, 1, NULL, 0);
#endif

    /* force the diagonal to stay real */
    a[diag * 2 + 1] = ZERO;

    a += step;
  }

  return 0;
}
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT alpha = *ALPHA; #else void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ #endif #ifdef SMP int mode, nthreads; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (n <= 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (n - 1) * incx; if (incy < 0) y -= (n - 1) * incy; #ifdef SMP nthreads = num_cpu_avail(1); //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. if (incx == 0 || incy == 0) nthreads = 1; //Temporarily work-around the low performance issue with small imput size & //multithreads. if (n <= 10000) nthreads = 1; if (nthreads == 1) { #endif AXPYU_K(n, 0, 0, alpha, x, incx, y, incy, NULL, 0); #ifdef SMP } else { #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif blas_level1_thread(mode, n, 0, 0, &alpha, x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); } #endif FUNCTION_PROFILE_END(1, 2 * n, 2 * n); IDEBUG_END; return; }
/*
 * In-place band triangular matrix-vector product (complex data, interleaved
 * real/imaginary parts), walking the columns of the band array from the
 * first to the last.
 *
 *   n        order of the matrix / length of b
 *   k        number of off-diagonals stored per column
 *   a, lda   band array and the distance between consecutive columns
 *            (in complex elements)
 *   b, incb  input vector, overwritten with the product, and its stride
 *   buffer   scratch space holding a unit-stride copy of b when incb != 1
 *
 * TRANSA selects the variant:
 *   1  AXPYU_K with the up to k entries stored in front of the diagonal,
 *      which sits at element k of the column
 *   2  DOTU_K with the up to k entries stored behind the diagonal, which
 *      sits at element 0 of the column
 *   3  AXPYC_K, otherwise like 1
 *   4  DOTC_K,  otherwise like 2
 * Unless UNIT is defined every entry is multiplied by the diagonal element
 * (TRANSA 1, 2) or by its complex conjugate (TRANSA 3, 4).
 *
 * No separate workspace is needed here, so none is carved out of buffer.
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  BLASLONG length;
  FLOAT *B = b;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT temp;
#endif
#ifndef UNIT
  FLOAT atemp1, atemp2, btemp1, btemp2;
#endif

  if (incb != 1) {
    B = buffer;
    COPY_K(n, b, incb, buffer, 1);
  }

  for (i = 0; i < n; i++) {

#if (TRANSA == 1) || (TRANSA == 3)
    /* entries in front of the diagonal, clipped to the band width */
    length = i;
    if (length > k) length = k;

    if (length > 0) {
#if TRANSA == 1
      AXPYU_K(length, 0, 0,
              B[i * 2 + 0], B[i * 2 + 1],
              a + (k - length) * COMPSIZE, 1,
              B + (i - length) * COMPSIZE, 1, NULL, 0);
#else
      AXPYC_K(length, 0, 0,
              B[i * 2 + 0], B[i * 2 + 1],
              a + (k - length) * COMPSIZE, 1,
              B + (i - length) * COMPSIZE, 1, NULL, 0);
#endif
    }
#endif

#ifndef UNIT
#if (TRANSA == 1) || (TRANSA == 3)
    atemp1 = a[k * 2 + 0];
    atemp2 = a[k * 2 + 1];
#else
    atemp1 = a[0];
    atemp2 = a[1];
#endif

    btemp1 = B[i * 2 + 0];
    btemp2 = B[i * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
    B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
    B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
    B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
    B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
    /* entries behind the diagonal, clipped to the band width */
    length = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
#if TRANSA == 2
      temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1);
#else
      temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1);
#endif

      B[i * 2 + 0] += CREAL(temp);
      B[i * 2 + 1] += CIMAG(temp);
    }
#endif

    a += lda * COMPSIZE;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * Worker that applies a rank-2 style update to columns [m_from, m_to) of a
 * packed triangular matrix.
 * NOTE(review): presumably dispatched by the threading layer with one
 * column range per thread; confirm against the caller.
 *
 *   args    - argument block: a = x, b = y, c = packed matrix,
 *             lda = incx, ldb = incy, m = order, alpha = scalar
 *             (two FLOATs when COMPLEX is defined)
 *   range_m - optional [from, to) column range; the full range [0, m) is
 *             used when NULL
 *   range_n, dummy1, pos - unused here
 *   buffer  - scratch space used to hold unit-stride copies of x and/or y
 *             when their strides are not 1
 *
 * Compile-time switches:
 *   LOWER          - column i holds m - i entries (starting at row i)
 *                    instead of i + 1 entries (rows 0 .. i)
 *   COMPLEX        - complex arithmetic for the plain (non-HEMV) variant
 *   HEMV / HEMVREV - variants that use different scalar combinations
 *                    (AXPYC_K instead of AXPYU_K for HEMVREV) and force the
 *                    imaginary part of each diagonal entry to ZERO
 *
 * Always returns 0.
 */
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG incx, incy;
  BLASLONG i, m_from, m_to;
  FLOAT alpha_r;
#ifdef COMPLEX
  FLOAT alpha_i;
#endif

  x = (FLOAT *)args -> a;
  y = (FLOAT *)args -> b;
  a = (FLOAT *)args -> c;

  /* The vector strides travel in the lda / ldb slots of the argument block. */
  incx = args -> lda;
  incy = args -> ldb;

  alpha_r = *((FLOAT *)args -> alpha + 0);
#ifdef COMPLEX
  alpha_i = *((FLOAT *)args -> alpha + 1);
#endif

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  /* Pack x to unit stride; only the part this range touches is copied. */
  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
    x = buffer;
    /* Step past the x copy so that y gets its own scratch region. */
    buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
  }

  /* Same for y. */
  if (incy != 1) {
#ifndef LOWER
    COPY_K(m_to, y, incy, buffer, 1);
#else
    COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1);
#endif
    y = buffer;
  }

  /* Skip the packed columns that precede m_from. */
#ifndef LOWER
  a += (m_from + 1) * m_from / 2 * COMPSIZE;
#else
  a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++){

#if !defined(HEMV) && !defined(HEMVREV)
#ifndef COMPLEX
    /* Real case: column += (alpha * x[i]) * y, then += (alpha * y[i]) * x. */
    if (x[i] != ZERO) {
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a, 1, NULL, 0);
#endif
    }
    if (y[i] != ZERO) {
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a, 1, NULL, 0);
#endif
    }
#else
    /* Complex case: the scalar passed to AXPYU_K is alpha * x[i], then alpha * y[i]. */
    if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0,
              alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
              alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
              y, 1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
              alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
              alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
              y + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
    }
    if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0,
              alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1],
              alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
              x, 1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
              alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1],
              alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
              x + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
    }
#endif
#else
    /* HEMV / HEMVREV variants. */
    if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
      /* Scalar is conj(alpha * x[i]). */
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0,
              alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
              - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1],
              y, 1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
              alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
              - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1],
              y + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#else
      /* Scalar is alpha * x[i], handed to AXPYC_K. */
#ifndef LOWER
      AXPYC_K(i + 1, 0, 0,
              alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
              alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
              y, 1, a, 1, NULL, 0);
#else
      AXPYC_K(args -> m - i, 0, 0,
              alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
              alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
              y + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#endif
    }
    if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
      /* Scalar is alpha * conj(y[i]). */
#ifndef LOWER
      AXPYU_K(i + 1, 0, 0,
              alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
              alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1],
              x, 1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
              alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
              alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1],
              x + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#else
      /* Scalar is conj(alpha) * y[i], handed to AXPYC_K. */
#ifndef LOWER
      AXPYC_K(i + 1, 0, 0,
              alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
              - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
              x, 1, a, 1, NULL, 0);
#else
      AXPYC_K(args -> m - i, 0, 0,
              alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
              - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
              x + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#endif
    }

    /* Force the imaginary part of the diagonal entry of this column to zero. */
#ifndef LOWER
    a[i * COMPSIZE + 1] = ZERO;
#else
    a[1] = ZERO;
#endif
#endif

    /* Advance to the next packed column. */
#ifndef LOWER
    a += (i + 1) * COMPSIZE;
#else
    a += (args -> m - i) * COMPSIZE;
#endif
  }

  return 0;
}
/*
 * In-place solve step over the complex vector b using a banded triangular
 * matrix held in a, walking the columns of a from first to last.
 * NOTE(review): this looks like the complex TBSV level-2 driver; confirm
 * against the build rules that define TRANSA / UNIT / CNAME.
 *
 *   n      - number of columns / vector elements processed
 *   k      - upper bound on the number of off-diagonal entries used per column
 *   a      - band storage; advanced by lda * COMPSIZE after every column
 *   lda    - column stride of a, in complex elements
 *   b      - vector updated in place, stride incb
 *   buffer - scratch space holding a unit-stride copy of b when incb != 1
 *
 * Compile-time switches:
 *   TRANSA 2/4 - before scaling, DOTU_K (2) or DOTC_K (4) over the entries
 *                stored before a[k] and B[i-span .. i-1] is subtracted from
 *                B[i]; the diagonal is read from a[k].
 *   TRANSA 1/3 - after scaling, AXPYU_K (1) or AXPYC_K (3) is called with the
 *                scalar -B[i] on the entries stored after a[0], targeting
 *                B[i+1 ..]; the diagonal is read from a[0].
 *   UNIT       - when defined, the scaling by the reciprocal diagonal is
 *                skipped.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG col;
  BLASLONG span;
  FLOAT *vec = b;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT acc;
#endif
#ifndef UNIT
  FLOAT diag_r, diag_i;   /* diagonal entry of the current column */
  FLOAT inv_r, inv_i;     /* its reciprocal (conjugated for TRANSA >= 3) */
  FLOAT rhs_r, rhs_i;     /* current vector element before scaling */
  FLOAT ratio, den;
#endif

  /* Work on a unit-stride copy when b is strided. */
  if (incb != 1) {
    COPY_K(n, b, incb, buffer, 1);
    vec = buffer;
  }

  for (col = 0; col < n; col++, a += lda * COMPSIZE) {

#if (TRANSA == 2) || (TRANSA == 4)
    /* Off-diagonal entries available for this column: min(col, k). */
    span = (col > k) ? k : col;

    if (span > 0) {
#if TRANSA == 2
      acc = DOTU_K(span, a + (k - span) * COMPSIZE, 1, vec + (col - span) * COMPSIZE, 1);
#else
      acc = DOTC_K(span, a + (k - span) * COMPSIZE, 1, vec + (col - span) * COMPSIZE, 1);
#endif

      vec[col * 2 + 0] -= CREAL(acc);
      vec[col * 2 + 1] -= CIMAG(acc);
    }
#endif

#ifndef UNIT
#if (TRANSA == 1) || (TRANSA == 3)
    diag_r = a[0];
    diag_i = a[1];
#else
    diag_r = a[k * 2 + 0];
    diag_i = a[k * 2 + 1];
#endif

    /*
     * Reciprocal of the diagonal, dividing by the larger-magnitude
     * component first. For TRANSA >= 3 the sign of the imaginary part is
     * flipped, which yields the reciprocal of the conjugate.
     */
    if (fabs(diag_r) >= fabs(diag_i)){
      ratio = diag_i / diag_r;
      den   = 1./(diag_r * ( 1 + ratio * ratio));
      inv_r = den;
#if TRANSA < 3
      inv_i = -ratio * den;
#else
      inv_i = ratio * den;
#endif
    } else {
      ratio = diag_r / diag_i;
      den   = 1./(diag_i * ( 1 + ratio * ratio));
      inv_r = ratio * den;
#if TRANSA < 3
      inv_i = -den;
#else
      inv_i = den;
#endif
    }

    /* vec[col] *= reciprocal (complex multiply). */
    rhs_r = vec[col * 2 + 0];
    rhs_i = vec[col * 2 + 1];

    vec[col * 2 + 0] = inv_r*rhs_r - inv_i*rhs_i;
    vec[col * 2 + 1] = inv_r*rhs_i + inv_i*rhs_r;
#endif

#if (TRANSA == 1) || (TRANSA == 3)
    /* Off-diagonal entries available for this column: min(n - col - 1, k). */
    span = n - col - 1;
    span = (span > k) ? k : span;

    if (span > 0) {
#if TRANSA == 1
      AXPYU_K(span, 0, 0,
              -vec[col * 2 + 0], -vec[col * 2 + 1],
              a + COMPSIZE, 1, vec + (col + 1) * COMPSIZE, 1, NULL, 0);
#else
      AXPYC_K(span, 0, 0,
              -vec[col * 2 + 0], -vec[col * 2 + 1],
              a + COMPSIZE, 1, vec + (col + 1) * COMPSIZE, 1, NULL, 0);
#endif
    }
#endif
  }

  /* Scatter the result back into the caller's strided vector. */
  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}