예제 #1
0
파일: tpmv_thread.c 프로젝트: 4ker/OpenBLAS
/*
 * Per-thread worker for the packed triangular matrix-vector product.
 *
 * Processes indices i in [m_from, m_to): reads the packed matrix from
 * args->a and the vector from args->b, and accumulates into args->c.
 * The variant is chosen at compile time by LOWER, TRANS, UNIT, COMPLEX
 * and TRANSA.
 *
 * args    : a = packed matrix, b = x, c = y, ldb = stride of x, m = order.
 * range_m : optional pair {m_from, m_to}; NULL means [0, args->m).
 * range_n : optional; without TRANS, *range_n is an element offset added to
 *           y.  Not read when TRANS is defined.
 * dummy1  : not used.
 * buffer  : scratch area; holds a unit-stride copy of x when incx != 1.
 * pos     : not used.
 *
 * Returns 0 in all cases.
 */
static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;

  BLASLONG incx;
  BLASLONG m_from, m_to;
  BLASLONG i;

  /* The dot-product accumulator is only needed by the TRANS variants. */
#ifdef TRANS
#ifndef COMPLEX
  FLOAT          result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

  /* Real/imaginary parts of the diagonal entry and of x[i]. */
#if defined(COMPLEX) && !defined(UNIT)
  FLOAT ar, ai, xr, xi;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  incx = args -> ldb;

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  /* Strided x: copy the part this thread reads into buffer at stride 1.
     Upper reads x[0 .. m_to), lower reads x[m_from .. m); the lower copy
     lands at the same element index inside buffer. */
  if (incx != 1) {

#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif

    x = buffer;
    /* Step past the copy, rounded up to a multiple of 1024 elements. */
    buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
  }

  /* SCAL_K is called with alpha = ZERO on the slice of y this thread
     accumulates into, before any contribution is added. */
#ifndef TRANS
  if (range_n) y += *range_n * COMPSIZE;

#ifndef LOWER
  SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#else

  SCAL_K(m_to - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);

#endif

  /* Position a for column m_from of the packed matrix.
     Upper: column j holds j + 1 entries, so a points at the column start.
     Lower: the offset is the column start minus m_from, so that
     a + i * COMPSIZE addresses the diagonal entry of column i. */
#ifndef LOWER
  a += (m_from + 1) * m_from / 2  * COMPSIZE;
#else
  a += (2 * args -> m - m_from - 1) * m_from / 2  * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++) {

    /* Upper: the i entries above the diagonal of column i. */
#ifndef LOWER
      if (i > 0) {
#ifndef TRANS
	MYAXPY(i, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a, 1, y, 1, NULL, 0);
#else
       	result = MYDOT(i,  a, 1, x, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif

    /* Diagonal contribution.  With UNIT the matrix entry is not read and
       x[i] is added as-is. */
#ifndef COMPLEX
#ifdef UNIT
    *(y + i * COMPSIZE) += *(x + i * COMPSIZE);
#else
    *(y + i * COMPSIZE) += *(a + i * COMPSIZE) * *(x + i * COMPSIZE);
#endif
#else
#ifdef UNIT
      *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0);
      *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1);
#else
      ar = *(a + i * COMPSIZE + 0);
      ai = *(a + i * COMPSIZE + 1);
      xr = *(x + i * COMPSIZE + 0);
      xi = *(x + i * COMPSIZE + 1);

      /* TRANSA 1 or 2: a * x.  Otherwise: conj(a) * x. */
#if (TRANSA == 1) || (TRANSA == 2)
      *(y + i * COMPSIZE + 0) += ar * xr - ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi + ai * xr;
#else
      *(y + i * COMPSIZE + 0) += ar * xr + ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi - ai * xr;
#endif
#endif
#endif

    /* Lower: the m - i - 1 entries below the diagonal of column i. */
#ifdef LOWER
      if (args -> m > i + 1) {
#ifndef TRANS
	MYAXPY(args -> m - i - 1, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a + (i + 1 ) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else

	result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif

    /* Move to the next packed column.  Upper: column i has i + 1 entries.
       Lower: column i has m - i entries, and a stays biased by the column
       index, hence m - i - 1. */
#ifndef LOWER
    a += (i + 1) * COMPSIZE;
#else
    a += (args -> m - i - 1) * COMPSIZE;
#endif

  }

  return 0;
}
예제 #2
0
/*
 * Per-thread worker for the symmetric / Hermitian band matrix-vector
 * product.
 *
 * Processes columns i in [n_from, n_to) of the band matrix args->a (column
 * stride lda, k off-diagonals) against the vector args->b, accumulating
 * into a private n-element output placed at the start of buffer.
 * LOWER, COMPLEX, HEMV and HEMVREV select the variant at compile time.
 *
 * args    : a = band matrix, b = x, lda = column stride of a,
 *           ldb = stride of x, n = order, k = number of off-diagonals.
 * range_m : optional pair {n_from, n_to}; NULL means [0, n).
 * range_n : not used.
 * dummy1  : not used.
 * buffer  : scratch area; first the per-thread y, then (when incx != 1) a
 *           unit-stride copy of x.
 * pos     : not used.
 *
 * Returns 0 in all cases.
 */
static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n, k, n_from, n_to;
  BLASLONG i, length;
#ifndef COMPLEX
  FLOAT result;
#else
  /* Use the project's complex type (the one CREAL/CIMAG are written for)
     rather than a hard-coded C99 _Complex, as the sibling kernels do. */
  OPENBLAS_COMPLEX_FLOAT result;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;

  lda  = args -> lda;
  incx = args -> ldb;

  n = args -> n;
  k = args -> k;

  n_from = 0;
  n_to   = n;

  /* Use y as each thread's n * COMPSIZE elements in the sb buffer; the
     reserved size is rounded up to a multiple of 1024 elements. */
  y = buffer;
  buffer += ((COMPSIZE * n  + 1023) & ~1023);

  if (range_m) {
    n_from = *(range_m + 0);
    n_to   = *(range_m + 1);

    /* Start at column n_from of the band storage. */
    a += n_from * lda  * COMPSIZE;
  }


  /* Strided x: work on a unit-stride copy of the whole vector. */
  if (incx != 1) {
    COPY_K(n, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * n  + 1023) & ~1023);
  }

  /* SCAL_K is called with alpha = ZERO on the private y before any
     contribution is accumulated. */
  SCAL_K(n, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);

  for (i = n_from; i < n_to; i++) {

#ifndef LOWER

    /* Upper storage: the diagonal sits at band row k; `length` entries lie
       above it in this column (fewer than k near the top-left corner). */
    length  = i;
    if (length > k) length = k;

    /* Off-diagonal entries scaled by x[i] go to y[i - length .. i). */
    MYAXPY(length, 0, 0,
	   *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	   *(x + i * COMPSIZE + 1),
#endif
	   a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0);

    /* Dot product of the same column with x gives the contribution to y[i].
       Without HEMV/HEMVREV it includes the diagonal (length + 1 terms);
       with them the diagonal is added separately below. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1);
#else
    result = MYDOT(length    , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE + 0) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    /* Only the real part of the diagonal entry is used. */
    *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 1);
#endif
#endif

#else

    /* Lower storage: the diagonal is the first entry of the column and
       `length` entries follow it (fewer than k near the bottom-right). */
    length  = k;
    if (n - i - 1 < k) length = n - i - 1;

    /* Off-diagonal entries scaled by x[i] go to y[i + 1 .. i + 1 + length). */
    MYAXPY(length, 0, 0,
	   *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	   *(x + i * COMPSIZE + 1),
#endif
	   a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1);
#else
    result = MYDOT(length    , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ;
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE + 0) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    /* Only the real part of the diagonal entry is used. */
    *(y + i * COMPSIZE + 0) += CREAL(result) + *a * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *a * *(x + i * COMPSIZE + 1);
#endif
#endif

#endif

    /* Next column of the band storage. */
    a += lda * COMPSIZE;
  }

  return 0;
}
예제 #3
0
파일: gbmv_thread.c 프로젝트: 4ker/OpenBLAS
/*
 * Per-thread worker for the general band matrix-vector product.
 *
 * Processes columns i in [n_from, n_to) of the band matrix args->a and
 * accumulates into args->c.  Without TRANSA each column is scaled by one
 * element of x and added to y (AXPY); with TRANSA each column is dotted
 * with x to produce one element of y.  COMPLEX and XCONJ select further
 * variants at compile time.
 *
 * args    : a = band matrix, b = x, c = y, lda = column stride of a,
 *           ldb = stride of x, ldc = ku, ldd = kl, m / n = dimensions.
 * range_m : optional; *range_m is an element offset added to y.
 * range_n : optional pair {n_from, n_to}; NULL means [0, args->n).
 * dummy1  : not used.
 * buffer  : scratch area; with TRANSA it holds a unit-stride copy of x
 *           when incx != 1.
 * pos     : not used.
 *
 * Returns 0 in all cases.
 */
static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n_from, n_to;
  BLASLONG i, offset_l, offset_u, uu, ll, ku, kl;
  /* The dot-product accumulator is only needed by the TRANSA variants. */
#ifdef TRANSA
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  lda  = args -> lda;
  incx = args -> ldb;
  /* The two band widths travel in the ldc / ldd fields of args. */
  ku   = args -> ldc;
  kl   = args -> ldd;

  n_from = 0;
  n_to   = args -> n;

  if (range_m) y += *range_m * COMPSIZE;

  if (range_n) {
    n_from = *(range_n + 0);
    n_to   = *(range_n + 1);

    /* Start at column n_from of the band storage. */
    a += n_from * lda  * COMPSIZE;
  }

  /* Columns at index m + ku and beyond are not visited. */
  n_to = MIN(n_to, args -> m + ku);

  /* With TRANSA x is read at stride 1 by MYDOT, so a strided x is copied
     first.  Without TRANSA x is walked with incx directly in the loop. */
#ifdef TRANSA
  if (incx != 1) {
    COPY_K(args -> m, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * args -> m  + 1023) & ~1023);
  }
#endif

  /* SCAL_K is called with alpha = ZERO over the full output length
     (m without TRANSA, n with TRANSA) before accumulation. */
  SCAL_K(
#ifndef TRANSA
	 args -> m,
#else
	 args -> n,
#endif
	 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);

  /* offset_u is the index inside the stored column that lines up with
     element 0 of the length-m vector (it goes negative once the column
     index exceeds ku); offset_l is that index plus m. */
  offset_u = ku - n_from;
  offset_l = ku - n_from + args -> m;

  /* Bias the length-m vector by -offset_u so the same index uu can be
     applied to both the stored column and the vector inside the loop. */
#ifndef TRANSA
  x += n_from * incx * COMPSIZE;
  y -= offset_u      * COMPSIZE;
#else
  x -= offset_u      * COMPSIZE;
  y += n_from        * COMPSIZE;
#endif

  for (i = n_from; i < n_to; i++) {

    /* Clip the usable part of the stored column to [0, ku + kl + 1). */
    uu = MAX(offset_u, 0);
    ll = MIN(offset_l, ku + kl + 1);

#ifndef TRANSA
    /* y += x[i] * column; XCONJ negates the imaginary part of x[i]. */
    MYAXPY(ll - uu, 0, 0,
	    *(x + 0),
#ifdef COMPLEX
#ifndef XCONJ
	     *(x + 1),
#else
	    -*(x + 1),
#endif
#endif
	    a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0);

    x += incx * COMPSIZE;
#else
    result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, x + uu * COMPSIZE, 1);

    /* The real build stores the result while the complex build accumulates;
       each y element is written once per call, after the SCAL_K above. */
#ifndef COMPLEX
    *y = result;
#else
    *(y + 0) += CREAL(result);
#ifndef XCONJ
    *(y + 1) += CIMAG(result);
#else
    *(y + 1) -= CIMAG(result);
#endif
#endif

    /* Together with offset_u-- below this keeps x + uu aligned for the
       next column. */
    x += COMPSIZE;
#endif

    /* Same idea for y: advancing it compensates for offset_u-- (non-TRANSA)
       or moves to the next output element (TRANSA). */
    y += COMPSIZE;

    offset_u --;
    offset_l --;

    a += lda * COMPSIZE;
  }

  return 0;
}
예제 #4
0
파일: spmv_thread.c 프로젝트: 4ker/OpenBLAS
/*
 * Per-thread worker for the packed symmetric / Hermitian matrix-vector
 * product.
 *
 * For each column i in [m_from, m_to) of the packed matrix args->a, a dot
 * product with x contributes to y[i] and an AXPY spreads x[i] times the
 * off-diagonal part of the column over the remaining rows of y.
 * LOWER, COMPLEX, HEMV and HEMVREV select the variant at compile time.
 *
 * args    : a = packed matrix, b = x, c = y, ldb = stride of x, m = order.
 * range_m : optional pair {m_from, m_to}; NULL means [0, args->m).
 * range_n : optional; *range_n is an element offset added to y.
 * dummy1  : not used.
 * buffer  : scratch area; holds a unit-stride copy of x when incx != 1.
 * pos     : not used.
 *
 * Returns 0 in all cases.
 */
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG incx;
  BLASLONG m_from, m_to, i;
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  incx = args -> ldb;

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  if (range_n) y += *range_n * COMPSIZE;

  /* Strided x: copy the part this thread reads into buffer at stride 1.
     Upper reads x[0 .. m_to), lower reads x[m_from .. m); the lower copy
     lands at the same element index inside buffer. */
  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif

    x = buffer;
  }

  /* SCAL_K is called with alpha = ZERO on the slice of y this thread
     accumulates into, before any contribution is added. */
#ifndef LOWER
  SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

  /* Position a for column m_from of the packed matrix.
     Upper: column j holds j + 1 entries, so a points at the column start.
     Lower: the offset is the column start minus m_from, so that
     a + i * COMPSIZE addresses the diagonal entry of column i. */
#ifndef LOWER
  a += (m_from + 1) * m_from / 2  * COMPSIZE;
#else
  a += (2 * args -> m - m_from - 1) * m_from / 2  * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++) {
#ifndef LOWER

    /* Dot of column i with x[0 .. i].  Without HEMV/HEMVREV the diagonal
       is included (i + 1 terms); with them it is added separately below. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(i + 1, a, 1, x, 1);
#else
    result = MYDOT(i    , a, 1, x, 1);
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    /* Only the real part of the diagonal entry is used. */
    *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1);
#endif
#endif

    /* The i entries above the diagonal, scaled by x[i], go to y[0 .. i). */
    MYAXPY(i, 0, 0,
	    *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	    *(x + i * COMPSIZE + 1),
#endif
	    a, 1, y, 1, NULL, 0);

    /* Next packed column: column i holds i + 1 entries. */
    a += (i + 1) * COMPSIZE;

#else
    /* Dot of column i (diagonal downwards) with x[i .. m).  With
       HEMV/HEMVREV the diagonal is skipped here and added below. */
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(args -> m - i    , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1);
#else
    result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    /* Only the real part of the diagonal entry is used. */
    *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1);
#endif
#endif

    /* The m - i - 1 entries below the diagonal, scaled by x[i], go to
       y[i + 1 .. m). */
    MYAXPY(args -> m - i - 1, 0, 0,
	    *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	    *(x + i * COMPSIZE + 1),
#endif
	    a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

    /* Next packed column: column i has m - i entries, and a stays biased
       by the column index, hence m - i - 1. */
    a += (args -> m - i - 1) * COMPSIZE;

#endif
  }

  return 0;
}
예제 #5
0
/*
 * Per-thread worker for the triangular matrix-vector product (full
 * storage, column stride lda).
 *
 * The index range [m_from, m_to) is walked in panels of at most
 * DTB_ENTRIES columns.  For each panel the rectangular part outside the
 * diagonal block is handled with one MYGEMV call, and the diagonal block
 * itself column by column with MYAXPY (no TRANS) or MYDOT (TRANS) plus an
 * explicit diagonal term.  LOWER, TRANS, UNIT, COMPLEX and TRANSA select
 * the variant at compile time.
 *
 * args    : a = matrix, b = x, c = y, lda = column stride of a,
 *           ldb = stride of x, m = order.
 * range_m : optional pair {m_from, m_to}; NULL means [0, args->m).
 * range_n : optional; without TRANS, *range_n is an element offset added to
 *           y.  Not read when TRANS is defined.
 * dummy1  : not used.
 * buffer  : scratch area; holds a unit-stride copy of x when incx != 1 and
 *           is passed on to MYGEMV as its work buffer.
 * pos     : not used.
 *
 * Returns 0 in all cases.
 */
static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;

  BLASLONG lda, incx;
  BLASLONG m_from, m_to;
  BLASLONG i, is, min_i;

  /* The dot-product accumulator is only needed by the TRANS variants. */
#ifdef TRANS
#ifndef COMPLEX
  FLOAT          result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

  /* Real/imaginary parts of the diagonal entry and of x[i]. */
#if defined(COMPLEX) && !defined(UNIT)
  FLOAT ar, ai, xr, xi;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  lda  = args -> lda;
  incx = args -> ldb;

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  /* Strided x: copy the part this thread reads into buffer at stride 1.
     Upper reads x[0 .. m_to), lower reads x[m_from .. m); the lower copy
     lands at the same element index inside buffer. */
  if (incx != 1) {

#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif

    x = buffer;
    /* Step past the copy, rounded up to a multiple of 4 elements. */
    buffer += ((COMPSIZE * args -> m + 3) & ~3);
  }

  /* SCAL_K is called with alpha = ZERO on the slice of y this thread
     accumulates into, before any contribution is added. */
#ifndef TRANS
  if (range_n) y += *range_n * COMPSIZE;

#ifndef LOWER
  SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#else

  SCAL_K(m_to - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);

#endif

  for (is = m_from; is < m_to; is += DTB_ENTRIES){

    /* Width of this panel; the last one may be narrower. */
    min_i = MIN(m_to - is, DTB_ENTRIES);

    /* Upper: rectangular block of rows [0, is) and columns
       [is, is + min_i), i.e. everything above the diagonal block.
       Without TRANS it maps x[is ..] into y[0 ..]; with TRANS it maps
       x[0 ..] into y[is ..]. */
#ifndef LOWER
    if (is > 0){
      MYGEMV(is, min_i, 0,
	     ONE,
#ifdef COMPLEX
	     ZERO,
#endif
	     a + is * lda * COMPSIZE, lda,
#ifndef TRANS
	     x + is * COMPSIZE, 1,
	     y,                 1,
#else
	     x,                 1,
	     y + is * COMPSIZE, 1,
#endif
	     buffer);
    }
#endif

    /* Diagonal block, one column at a time. */
    for (i = is; i < is + min_i; i++) {

      /* Upper: the i - is entries of column i that lie inside the block
         and above the diagonal. */
#ifndef LOWER
      if (i - is > 0) {
#ifndef TRANS
	MYAXPY(i - is, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0);
#else

	result = MYDOT(i - is,  a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif

      /* Diagonal contribution.  With UNIT the matrix entry is not read and
         x[i] is added as-is. */
#ifndef COMPLEX
#ifdef UNIT
      *(y + i * COMPSIZE) += *(x + i * COMPSIZE);
#else
      *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE);
#endif
#else
#ifdef UNIT
      *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0);
      *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1);
#else
      ar = *(a + (i + i * lda) * COMPSIZE + 0);
      ai = *(a + (i + i * lda) * COMPSIZE + 1);
      xr = *(x +  i            * COMPSIZE + 0);
      xi = *(x +  i            * COMPSIZE + 1);

      /* TRANSA 1 or 2: a * x.  Otherwise: conj(a) * x. */
#if (TRANSA == 1) || (TRANSA == 2)
      *(y + i * COMPSIZE + 0) += ar * xr - ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi + ai * xr;
#else
      *(y + i * COMPSIZE + 0) += ar * xr + ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi - ai * xr;
#endif
#endif
#endif

      /* Lower: the entries of column i that lie inside the block and
         below the diagonal. */
#ifdef LOWER
      if (is + min_i > i + 1) {
#ifndef TRANS
	MYAXPY(is + min_i - i - 1, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else

	result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif
    }

    /* Lower: rectangular block of rows [is + min_i, m) and columns
       [is, is + min_i), i.e. everything below the diagonal block.
       Without TRANS it maps x[is ..] into y[is + min_i ..]; with TRANS it
       maps x[is + min_i ..] into y[is ..]. */
#ifdef LOWER
    if (args -> m >  is + min_i){
      MYGEMV(args -> m - is - min_i, min_i, 0,
	     ONE,
#ifdef COMPLEX
	     ZERO,
#endif
	     a + (is + min_i + is * lda) * COMPSIZE, lda,
#ifndef TRANS
	     x +  is          * COMPSIZE, 1,
	     y + (is + min_i) * COMPSIZE, 1,
#else
	     x + (is + min_i) * COMPSIZE, 1,
	     y +  is          * COMPSIZE, 1,
#endif
	     buffer);
    }
#endif
  }

  return 0;
}