/*
 * Worker routine for the packed triangular matrix-vector product.
 *
 * Reads the packed matrix from args->a, the input vector from args->b
 * (stride args->ldb) and accumulates into the output vector args->c.
 *
 *   range_m  optional pair {first, last} bounding the indices i handled by
 *            this call; when NULL the full range [0, args->m) is processed.
 *   range_n  in the non-TRANS build only: optional element offset applied
 *            to y before anything is written.
 *   buffer   scratch space; receives a unit-stride copy of x when the
 *            caller's stride is not 1.
 *   dummy1, pos  unused here.
 *
 * Compile-time switches LOWER, TRANS, UNIT, COMPLEX and TRANSA select the
 * variant.  Always returns 0.
 */
static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args->a;
  FLOAT *x = (FLOAT *)args->b;
  FLOAT *y = (FLOAT *)args->c;

  BLASLONG incx  = args->ldb;
  BLASLONG first = 0;
  BLASLONG last  = args->m;
  BLASLONG i;

#ifdef TRANS
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

#if defined(COMPLEX) && !defined(UNIT)
  FLOAT ar, ai, xr, xi;
#endif

  if (range_m) {
    first = range_m[0];
    last  = range_m[1];
  }

  if (incx != 1) {
    /* Copy the part of x this call reads into contiguous scratch:
       entries [0, last) for the upper variant, [first, m) for LOWER. */
#ifndef LOWER
    COPY_K(last, x, incx, buffer, 1);
#else
    COPY_K(args->m - first, x + first * incx * COMPSIZE, incx, buffer + first * COMPSIZE, 1);
#endif

    x = buffer;
    buffer += ((COMPSIZE * args->m + 1023) & ~1023);
  }

  /* Scale the region of y this call may touch by ZERO before accumulating
     (presumably this clears it -- depends on SCAL_K semantics). */
#ifndef TRANS
  if (range_n) y += *range_n * COMPSIZE;

#ifndef LOWER
  SCAL_K(last, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args->m - first, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y + first * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#else

  SCAL_K(last - first, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y + first * COMPSIZE, 1, NULL, 0, NULL, 0);

#endif

  /* Advance a past the packed entries belonging to indices below `first`. */
#ifndef LOWER
  a += (first + 1) * first / 2 * COMPSIZE;
#else
  a += (2 * args->m - first - 1) * first / 2 * COMPSIZE;
#endif

  for (i = first; i < last; i++) {

#ifndef LOWER
    /* The i packed entries stored before a[i] in the current segment. */
    if (i > 0) {
#ifndef TRANS
      MYAXPY(i, 0, 0,
             x[i * COMPSIZE + 0],
#ifdef COMPLEX
             x[i * COMPSIZE + 1],
#endif
             a, 1, y, 1, NULL, 0);
#else
      result = MYDOT(i, a, 1, x, 1);

#ifndef COMPLEX
      y[i * COMPSIZE + 0] += result;
#else
      y[i * COMPSIZE + 0] += CREAL(result);
      y[i * COMPSIZE + 1] += CIMAG(result);
#endif

#endif
    }
#endif

    /* Contribution of the entry at a[i]; with UNIT it is taken to be one. */
#ifndef COMPLEX
#ifdef UNIT
    y[i * COMPSIZE] += x[i * COMPSIZE];
#else
    y[i * COMPSIZE] += a[i * COMPSIZE] * x[i * COMPSIZE];
#endif
#else
#ifdef UNIT
    y[i * COMPSIZE + 0] += x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += x[i * COMPSIZE + 1];
#else
    ar = a[i * COMPSIZE + 0];
    ai = a[i * COMPSIZE + 1];
    xr = x[i * COMPSIZE + 0];
    xi = x[i * COMPSIZE + 1];

#if (TRANSA == 1) || (TRANSA == 2)
    /* plain complex product a * x */
    y[i * COMPSIZE + 0] += ar * xr - ai * xi;
    y[i * COMPSIZE + 1] += ar * xi + ai * xr;
#else
    /* product with the conjugate of a */
    y[i * COMPSIZE + 0] += ar * xr + ai * xi;
    y[i * COMPSIZE + 1] += ar * xi - ai * xr;
#endif
#endif
#endif

#ifdef LOWER
    /* The m - i - 1 packed entries stored after a[i] in the current segment. */
    if (args->m > i + 1) {
#ifndef TRANS
      MYAXPY(args->m - i - 1, 0, 0,
             x[i * COMPSIZE + 0],
#ifdef COMPLEX
             x[i * COMPSIZE + 1],
#endif
             a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else

      result = MYDOT(args->m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
      y[i * COMPSIZE + 0] += result;
#else
      y[i * COMPSIZE + 0] += CREAL(result);
      y[i * COMPSIZE + 1] += CIMAG(result);
#endif

#endif
    }
#endif

    /* Step to the next packed segment. */
#ifndef LOWER
    a += (i + 1) * COMPSIZE;
#else
    a += (args->m - i - 1) * COMPSIZE;
#endif

  }

  return 0;
}
/*
 * Worker routine for the symmetric / Hermitian band matrix-vector product.
 *
 * Reads the band matrix from args->a (leading dimension args->lda, band
 * width args->k, order args->n) and the input vector from args->b
 * (stride args->ldb).  The result is NOT written to args->c: it is
 * accumulated into the first n * COMPSIZE elements of `buffer`, which this
 * routine uses as its private output vector.
 *
 *   range_m  optional pair {first, last} bounding the indices i handled by
 *            this call; when NULL the full range [0, n) is processed.
 *   buffer   scratch space: output slice first, then (only if the caller's
 *            stride is not 1) a unit-stride copy of x.
 *   range_n, dummy1, pos  unused here.
 *
 * Compile-time switches LOWER, COMPLEX, HEMV and HEMVREV select the
 * variant.  Always returns 0.
 */
static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n, k, n_from, n_to;
  BLASLONG i, length;
#ifndef COMPLEX
  FLOAT result;
#else
  /* Use the library's complex typedef (the one CREAL/CIMAG are applied to in
     the sibling kernels) rather than the C99 `_Complex` keyword, which is
     not available on every supported compiler. */
  OPENBLAS_COMPLEX_FLOAT result;
#endif

  a = (FLOAT *)args->a;
  x = (FLOAT *)args->b;

  lda  = args->lda;
  incx = args->ldb;

  n = args->n;
  k = args->k;

  n_from = 0;
  n_to   = n;

  /* The first n * COMPSIZE elements of the scratch buffer serve as this
     call's private y; the remaining scratch starts at the next multiple
     of 1024 elements. */
  y = buffer;
  buffer += ((COMPSIZE * n + 1023) & ~1023);

  if (range_m) {
    n_from = range_m[0];
    n_to   = range_m[1];

    /* Skip the band-storage entries belonging to indices below n_from. */
    a += n_from * lda * COMPSIZE;
  }

  if (incx != 1) {
    /* Make x contiguous so the inner kernels can run with unit stride. */
    COPY_K(n, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * n + 1023) & ~1023);
  }

  /* Scale the whole private y by ZERO before accumulating
     (presumably this clears it -- depends on SCAL_K semantics). */
  SCAL_K(n, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);

  for (i = n_from; i < n_to; i++) {

#ifndef LOWER

    /* Number of stored entries before position k in this band segment,
       capped at the band width. */
    length = i;
    if (length > k) length = k;

    MYAXPY(length, 0, 0,
           x[i * COMPSIZE + 0],
#ifdef COMPLEX
           x[i * COMPSIZE + 1],
#endif
           a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0);

    /* Without HEMV/HEMVREV the dot product includes the entry at position k
       (length + 1 terms); with them that entry is added separately below
       using only its first (real) component. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1);
#else
    result = MYDOT(length    , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    y[i * COMPSIZE + 0] += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y[i * COMPSIZE + 0] += CREAL(result);
    y[i * COMPSIZE + 1] += CIMAG(result);
#else
    y[i * COMPSIZE + 0] += CREAL(result) + a[k * COMPSIZE] * x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += CIMAG(result) + a[k * COMPSIZE] * x[i * COMPSIZE + 1];
#endif
#endif

#else

    /* Number of stored entries after position 0 in this band segment,
       capped at the band width. */
    length = k;
    if (n - i - 1 < k) length = n - i - 1;

    MYAXPY(length, 0, 0,
           x[i * COMPSIZE + 0],
#ifdef COMPLEX
           x[i * COMPSIZE + 1],
#endif
           a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

    /* Same split as above: the entry at position 0 is either part of the
       dot product or added separately in the HEMV/HEMVREV builds. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1);
#else
    result = MYDOT(length    , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    y[i * COMPSIZE + 0] += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y[i * COMPSIZE + 0] += CREAL(result);
    y[i * COMPSIZE + 1] += CIMAG(result);
#else
    y[i * COMPSIZE + 0] += CREAL(result) + a[0] * x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += CIMAG(result) + a[0] * x[i * COMPSIZE + 1];
#endif
#endif

#endif

    /* Step to the next band segment. */
    a += lda * COMPSIZE;
  }

  return 0;
}
/*
 * Worker routine for the general band matrix-vector product.
 *
 * Reads the band matrix from args->a (leading dimension args->lda), the
 * input vector from args->b (stride args->ldb) and writes into args->c.
 * The band widths are carried in args->ldc (ku) and args->ldd (kl).
 *
 *   range_m  optional: *range_m is an element offset applied to y.
 *   range_n  optional pair {first, last} bounding the indices i handled by
 *            this call; `last` is further clamped to args->m + ku.
 *   buffer   scratch space; in the TRANSA build it receives a unit-stride
 *            copy of x when the caller's stride is not 1.
 *   dummy1, pos  unused here.
 *
 * Compile-time switches TRANSA, COMPLEX and XCONJ select the variant.
 * Always returns 0.
 */
static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args->a;
  FLOAT *x = (FLOAT *)args->b;
  FLOAT *y = (FLOAT *)args->c;

  BLASLONG lda  = args->lda;
  BLASLONG incx = args->ldb;
  BLASLONG ku   = args->ldc;
  BLASLONG kl   = args->ldd;

  BLASLONG n_from = 0;
  BLASLONG n_to   = args->n;
  BLASLONG i, offset_l, offset_u, uu, ll;
  BLASLONG y_len;

#ifdef TRANSA
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

  if (range_m) y += *range_m * COMPSIZE;

  if (range_n) {
    n_from = range_n[0];
    n_to   = range_n[1];

    /* Skip the band-storage entries belonging to indices below n_from. */
    a += n_from * lda * COMPSIZE;
  }

  n_to = MIN(n_to, args->m + ku);

#ifdef TRANSA
  if (incx != 1) {
    /* Make x contiguous; the dot-product path below indexes it with unit stride. */
    COPY_K(args->m, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * args->m + 1023) & ~1023);
  }
#endif

  /* Length of the output vector: m without TRANSA, n with it. */
#ifndef TRANSA
  y_len = args->m;
#else
  y_len = args->n;
#endif

  /* Scale y by ZERO before accumulating
     (presumably this clears it -- depends on SCAL_K semantics). */
  SCAL_K(y_len, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);

  /* offset_u / offset_l bound the slice [uu, ll) of the current band
     segment that is used; both shrink by one per iteration. */
  offset_u = ku - n_from;
  offset_l = ku - n_from + args->m;

  /* Bias the vector that is indexed by band position so that position uu
     lines up with the matching vector element; move the other vector to
     the first index handled by this call. */
#ifndef TRANSA
  x += n_from * incx * COMPSIZE;
  y -= offset_u * COMPSIZE;
#else
  x -= offset_u * COMPSIZE;
  y += n_from * COMPSIZE;
#endif

  for (i = n_from; i < n_to; i++) {

    uu = MAX(offset_u, 0);
    ll = MIN(offset_l, ku + kl + 1);

#ifndef TRANSA
    /* y[uu..ll) += x_i * a[uu..ll); XCONJ negates the imaginary part of x_i. */
    MYAXPY(ll - uu, 0, 0,
           x[0],
#ifdef COMPLEX
#ifndef XCONJ
           x[1],
#else
           -x[1],
#endif
#endif
           a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0);

    x += incx * COMPSIZE;
#else
    result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, x + uu * COMPSIZE, 1);

#ifndef COMPLEX
    /* The real path stores the dot product; the complex path accumulates. */
    y[0] = result;
#else
    y[0] += CREAL(result);
#ifndef XCONJ
    y[1] += CIMAG(result);
#else
    y[1] -= CIMAG(result);
#endif
#endif

    x += COMPSIZE;
#endif

    y += COMPSIZE;

    offset_u--;
    offset_l--;

    /* Step to the next band segment. */
    a += lda * COMPSIZE;
  }

  return 0;
}
/*
 * Worker routine for the packed symmetric / Hermitian matrix-vector product.
 *
 * Reads the packed matrix from args->a, the input vector from args->b
 * (stride args->ldb) and accumulates into the output vector args->c.
 *
 *   range_m  optional pair {first, last} bounding the indices i handled by
 *            this call; when NULL the full range [0, args->m) is processed.
 *   range_n  optional: *range_n is an element offset applied to y.
 *   buffer   scratch space; receives a unit-stride copy of x when the
 *            caller's stride is not 1.
 *   dummy1, pos  unused here.
 *
 * Compile-time switches LOWER, COMPLEX, HEMV and HEMVREV select the
 * variant.  Always returns 0.
 */
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args->a;
  FLOAT *x = (FLOAT *)args->b;
  FLOAT *y = (FLOAT *)args->c;

  BLASLONG incx  = args->ldb;
  BLASLONG first = 0;
  BLASLONG last  = args->m;
  BLASLONG i;

#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif

  if (range_m) {
    first = range_m[0];
    last  = range_m[1];
  }

  if (range_n) y += *range_n * COMPSIZE;

  if (incx != 1) {
    /* Copy the part of x this call reads into contiguous scratch:
       entries [0, last) for the upper variant, [first, m) for LOWER. */
#ifndef LOWER
    COPY_K(last, x, incx, buffer, 1);
#else
    COPY_K(args->m - first, x + first * incx * COMPSIZE, incx, buffer + first * COMPSIZE, 1);
#endif

    x = buffer;
  }

  /* Scale the region of y this call may touch by ZERO before accumulating
     (presumably this clears it -- depends on SCAL_K semantics). */
#ifndef LOWER
  SCAL_K(last, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args->m - first, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y + first * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

  /* Advance a past the packed entries belonging to indices below `first`. */
#ifndef LOWER
  a += (first + 1) * first / 2 * COMPSIZE;
#else
  a += (2 * args->m - first - 1) * first / 2 * COMPSIZE;
#endif

  for (i = first; i < last; i++) {

#ifndef LOWER

    /* Without HEMV/HEMVREV the dot product includes the entry at a[i]
       (i + 1 terms); with them that entry is added separately below using
       only its first (real) component. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(i + 1, a, 1, x, 1);
#else
    result = MYDOT(i    , a, 1, x, 1);
#endif

#ifndef COMPLEX
    y[i * COMPSIZE] += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y[i * COMPSIZE + 0] += CREAL(result);
    y[i * COMPSIZE + 1] += CIMAG(result);
#else
    y[i * COMPSIZE + 0] += CREAL(result) + a[i * COMPSIZE] * x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += CIMAG(result) + a[i * COMPSIZE] * x[i * COMPSIZE + 1];
#endif
#endif

    /* Scatter x_i times the i entries stored before a[i] into y[0..i). */
    MYAXPY(i, 0, 0,
           x[i * COMPSIZE + 0],
#ifdef COMPLEX
           x[i * COMPSIZE + 1],
#endif
           a, 1, y, 1, NULL, 0);

    /* Step to the next packed segment. */
    a += (i + 1) * COMPSIZE;

#else

    /* Same split as the upper variant, over the entries from a[i] onward. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(args->m - i    , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1);
#else
    result = MYDOT(args->m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    y[i * COMPSIZE] += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y[i * COMPSIZE + 0] += CREAL(result);
    y[i * COMPSIZE + 1] += CIMAG(result);
#else
    y[i * COMPSIZE + 0] += CREAL(result) + a[i * COMPSIZE] * x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += CIMAG(result) + a[i * COMPSIZE] * x[i * COMPSIZE + 1];
#endif
#endif

    /* Scatter x_i times the m - i - 1 entries stored after a[i] into y(i..m). */
    MYAXPY(args->m - i - 1, 0, 0,
           x[i * COMPSIZE + 0],
#ifdef COMPLEX
           x[i * COMPSIZE + 1],
#endif
           a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

    /* Step to the next packed segment. */
    a += (args->m - i - 1) * COMPSIZE;

#endif
  }

  return 0;
}
/*
 * Worker routine for the full-storage triangular matrix-vector product.
 *
 * Reads the matrix from args->a (leading dimension args->lda), the input
 * vector from args->b (stride args->ldb) and accumulates into args->c.
 * The index range is walked in blocks of at most DTB_ENTRIES: the part of
 * the triangle outside the block's diagonal square is handed to MYGEMV,
 * and the square itself is processed entry by entry with MYAXPY / MYDOT.
 *
 *   range_m  optional pair {first, last} bounding the indices handled by
 *            this call; when NULL the full range [0, args->m) is processed.
 *   range_n  in the non-TRANS build only: optional element offset applied
 *            to y before anything is written.
 *   buffer   scratch space; receives a unit-stride copy of x when the
 *            caller's stride is not 1, and what remains is passed to MYGEMV.
 *   dummy1, pos  unused here.
 *
 * Compile-time switches LOWER, TRANS, UNIT, COMPLEX and TRANSA select the
 * variant.  Always returns 0.
 */
static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args->a;
  FLOAT *x = (FLOAT *)args->b;
  FLOAT *y = (FLOAT *)args->c;

  BLASLONG lda   = args->lda;
  BLASLONG incx  = args->ldb;
  BLASLONG first = 0;
  BLASLONG last  = args->m;
  BLASLONG i, blk, blk_len;

#ifdef TRANS
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

#if defined(COMPLEX) && !defined(UNIT)
  FLOAT ar, ai, xr, xi;
#endif

  if (range_m) {
    first = range_m[0];
    last  = range_m[1];
  }

  if (incx != 1) {
    /* Copy the part of x this call reads into contiguous scratch:
       entries [0, last) for the upper variant, [first, m) for LOWER. */
#ifndef LOWER
    COPY_K(last, x, incx, buffer, 1);
#else
    COPY_K(args->m - first, x + first * incx * COMPSIZE, incx, buffer + first * COMPSIZE, 1);
#endif

    x = buffer;
    /* The rest of the scratch (rounded up to a multiple of 4 elements)
       is what MYGEMV receives below. */
    buffer += ((COMPSIZE * args->m + 3) & ~3);
  }

  /* Scale the region of y this call may touch by ZERO before accumulating
     (presumably this clears it -- depends on SCAL_K semantics). */
#ifndef TRANS
  if (range_n) y += *range_n * COMPSIZE;

#ifndef LOWER
  SCAL_K(last, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args->m - first, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y + first * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#else

  SCAL_K(last - first, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y + first * COMPSIZE, 1, NULL, 0, NULL, 0);

#endif

  for (blk = first; blk < last; blk += DTB_ENTRIES) {

    blk_len = MIN(last - blk, DTB_ENTRIES);

#ifndef LOWER
    /* Rectangular panel of `blk` rows preceding the diagonal square. */
    if (blk > 0) {
      MYGEMV(blk, blk_len, 0,
             ONE,
#ifdef COMPLEX
             ZERO,
#endif
             a + blk * lda * COMPSIZE, lda,
#ifndef TRANS
             x + blk * COMPSIZE, 1,
             y,                  1,
#else
             x,                  1,
             y + blk * COMPSIZE, 1,
#endif
             buffer);
    }
#endif

    for (i = blk; i < blk + blk_len; i++) {

#ifndef LOWER
      /* Entries of column i inside the square, before the diagonal. */
      if (i - blk > 0) {
#ifndef TRANS
        MYAXPY(i - blk, 0, 0,
               x[i * COMPSIZE + 0],
#ifdef COMPLEX
               x[i * COMPSIZE + 1],
#endif
               a + (blk + i * lda) * COMPSIZE, 1, y + blk * COMPSIZE, 1, NULL, 0);
#else

        result = MYDOT(i - blk, a + (blk + i * lda) * COMPSIZE, 1, x + blk * COMPSIZE, 1);

#ifndef COMPLEX
        y[i * COMPSIZE + 0] += result;
#else
        y[i * COMPSIZE + 0] += CREAL(result);
        y[i * COMPSIZE + 1] += CIMAG(result);
#endif

#endif
      }
#endif

      /* Diagonal entry a(i, i); with UNIT it is taken to be one. */
#ifndef COMPLEX
#ifdef UNIT
      y[i * COMPSIZE] += x[i * COMPSIZE];
#else
      y[i * COMPSIZE] += a[(i + i * lda) * COMPSIZE] * x[i * COMPSIZE];
#endif
#else
#ifdef UNIT
      y[i * COMPSIZE + 0] += x[i * COMPSIZE + 0];
      y[i * COMPSIZE + 1] += x[i * COMPSIZE + 1];
#else
      ar = a[(i + i * lda) * COMPSIZE + 0];
      ai = a[(i + i * lda) * COMPSIZE + 1];
      xr = x[i * COMPSIZE + 0];
      xi = x[i * COMPSIZE + 1];

#if (TRANSA == 1) || (TRANSA == 2)
      /* plain complex product a * x */
      y[i * COMPSIZE + 0] += ar * xr - ai * xi;
      y[i * COMPSIZE + 1] += ar * xi + ai * xr;
#else
      /* product with the conjugate of a */
      y[i * COMPSIZE + 0] += ar * xr + ai * xi;
      y[i * COMPSIZE + 1] += ar * xi - ai * xr;
#endif
#endif
#endif

#ifdef LOWER
      /* Entries of column i inside the square, after the diagonal. */
      if (blk + blk_len > i + 1) {
#ifndef TRANS
        MYAXPY(blk + blk_len - i - 1, 0, 0,
               x[i * COMPSIZE + 0],
#ifdef COMPLEX
               x[i * COMPSIZE + 1],
#endif
               a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else

        result = MYDOT(blk + blk_len - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
        y[i * COMPSIZE + 0] += result;
#else
        y[i * COMPSIZE + 0] += CREAL(result);
        y[i * COMPSIZE + 1] += CIMAG(result);
#endif

#endif
      }
#endif
    }

#ifdef LOWER
    /* Rectangular panel of the rows that follow the diagonal square. */
    if (args->m > blk + blk_len) {
      MYGEMV(args->m - blk - blk_len, blk_len, 0,
             ONE,
#ifdef COMPLEX
             ZERO,
#endif
             a + (blk + blk_len + blk * lda) * COMPSIZE, lda,
#ifndef TRANS
             x + blk * COMPSIZE,             1,
             y + (blk + blk_len) * COMPSIZE, 1,
#else
             x + (blk + blk_len) * COMPSIZE, 1,
             y + blk * COMPSIZE,             1,
#endif
             buffer);
    }
#endif
  }

  return 0;
}