コード例 #1
0
ファイル: spr2_k.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, 
		       FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){

  BLASLONG i;
  FLOAT *X, *Y;

  X = x;
  Y = y;

  if (incx != 1) {
    COPY_K(m, x, incx, buffer, 1);
    X = buffer;
  }

  if (incy != 1) {
    COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1);
    Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2));
  }

  for (i = 0; i < m; i++){
#ifndef LOWER
    AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y,     1, a, 1, NULL, 0);
    AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X,     1, a, 1, NULL, 0);
    a += i + 1;
#else
    AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0);
    AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0);
    a += m - i;
#endif
    }

  return 0;
}
コード例 #2
0
ファイル: spr_k.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT alpha_r, 
	  FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){

  BLASLONG i;
  FLOAT *X;

  X = x;

  if (incx != 1) {
    COPY_K(m, x, incx, buffer, 1);
    X = buffer;
  }

  for (i = 0; i < m; i++){
#ifndef LOWER
    if (X[i] != ZERO) {
      AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X,     1, a, 1, NULL, 0);
    }
    a += i + 1;
#else
    if (X[i] != ZERO) {
      AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0);
    }
    a += m - i;
#endif
    }

  return 0;
}
コード例 #3
0
ファイル: zspr_k.c プロジェクト: 4ker/OpenBLAS
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
	  FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){

  BLASLONG i;
  FLOAT *X;

  X = x;

  if (incx != 1) {
    COPY_K(m, x, incx, buffer, 1);
    X = buffer;
  }

  for (i = 0; i < m; i++){
#ifndef LOWER
    if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) {
      AXPYU_K(i + 1, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	      alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
	      X,         1, a, 1, NULL, 0);
    }
    a += (i + 1) * 2;
#else
    if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) {
      AXPYU_K(m - i, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	      alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
	      X + i * 2, 1, a, 1, NULL, 0);
    }
    a += (m - i) * 2;
#endif
    }

  return 0;
}
コード例 #4
0
ファイル: trsv_U.c プロジェクト: AmEv7Fam/opentoonz
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i, is, min_i;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }
  
  for (is = m; is > 0; is -= DTB_ENTRIES){

    min_i = MIN(is, DTB_ENTRIES);

#ifdef TRANSA
    if (m - is > 0){
      GEMV_T(m - is, min_i, 0, dm1, 
	     a + is + (is - min_i) * lda, lda,
	     B + is, 1,
	     B + is - min_i, 1, gemvbuffer);
    }
#endif
    
    for (i = 0; i < min_i; i++) {
      FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda;
      FLOAT *BB = B + (is - i - 1);

#ifdef TRANSA
      if (i > 0) BB[0] -= DOTU_K(i, AA + 1, 1, BB + 1, 1);
#endif

#ifndef UNIT
      BB[0] /= AA[0];
#endif

#ifndef TRANSA
      if (i < min_i - 1) AXPYU_K(min_i - i - 1, 0, 0, -BB[0], AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1, NULL, 0);
#endif
    }

#ifndef TRANSA
    if (is - min_i > 0){
      GEMV_N(is - min_i, min_i, 0, dm1, 
	     a +  (is - min_i) * lda, lda,
	     B + is - min_i, 1,
	     B,              1, gemvbuffer);
    }
#endif
    
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #5
0
ファイル: trmv_U.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){

  BLASLONG i, is, min_i;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  for (is = 0; is < m; is += DTB_ENTRIES){
    
    min_i = MIN(m - is, DTB_ENTRIES);

#ifndef TRANSA
    if (is > 0){
      GEMV_N(is, min_i, 0, dp1, 
	     a + is * lda,  lda,
	     B + is, 1,
	     B,      1, gemvbuffer);
    }
#endif

    for (i = 0; i < min_i; i++) {
      FLOAT *AA = a + is + (i + is) * lda;
      FLOAT *BB = B + is;
      
#ifndef TRANSA
      if (i > 0) AXPYU_K(i, 0, 0, BB[i], AA, 1, BB, 1, NULL, 0);
#endif

#ifndef UNIT
      BB[i] *= AA[i];
#endif

#ifdef TRANSA
      if (i < min_i - 1) BB[i] += DOTU_K(min_i - i - 1, AA + i + 1, 1, BB + i + 1, 1);
#endif
    }

#ifdef TRANSA
    if (m - is > min_i){
      GEMV_T(m - is - min_i, min_i, 0, dp1, 
	     a + is + min_i + is * lda,  lda,
	     B + is + min_i, 1,
	     B + is,         1, gemvbuffer);
  }
#endif

  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #6
0
ファイル: tbsv_U.c プロジェクト: AmEv7Fam/opentoonz
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;
  BLASLONG length;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
    COPY_K(n, b, incb, buffer, 1);
  }
  
  a += (n - 1) * lda;

  for (i = n - 1; i >= 0; i--) {

#ifdef TRANSA
    length  = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
      B[i] -= DOTU_K(length, a + 1, 1, B + i + 1, 1);
    }
#endif

#ifndef UNIT
#ifdef TRANSA
    B[i] /= a[0];
#else
    B[i] /= a[k];
#endif
#endif
    
#ifndef TRANSA
    length  = i;
    if (length > k) length = k;

    if (length > 0) {
      AXPYU_K(length, 0, 0,
	     - B[i], 
	     a + k - length, 1, B + i - length, 1, NULL, 0);
    }
#endif

    a -= lda;
  }
    
  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #7
0
ファイル: zhpr_k.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x,
		      BLASLONG incx, FLOAT *a, FLOAT *buffer){

  BLASLONG i;
  FLOAT *X;

  X = x;

  if (incx != 1) {
    COPY_K(m, x, incx, buffer, 1);
    X = buffer;
  }

  for (i = 0; i < m; i++){
#ifndef HEMVREV
#ifndef LOWER
    AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X,         1, a, 1, NULL, 0);
    a[i * 2 + 1] = ZERO;
    a += (i + 1) * 2;
#else
    AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0);
    a[1] = ZERO;
    a += (m - i) * 2;
#endif
#else
#ifndef LOWER
    AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X,         1, a, 1, NULL, 0);
    a[i * 2 + 1] = ZERO;
    a += (i + 1) * 2;
#else
    AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0);
    a[1] = ZERO;
    a += (m - i) * 2;
#endif
#endif
    }

  return 0;
}
コード例 #8
0
ファイル: tbmv_U.c プロジェクト: 4ker/OpenBLAS
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  FLOAT *B = b;
  BLASLONG length;

  if (incb != 1) {
    B = buffer;
    COPY_K(n, b, incb, buffer, 1);
  }

  for (i = 0; i < n; i++) {

#ifndef TRANSA
    length  = i;
    if (length > k) length = k;

    if (length > 0) {
      AXPYU_K(length, 0, 0,
	     B[i],
	     a + k - length, 1, B + i - length, 1, NULL, 0);
    }
#endif

#ifndef UNIT
#ifndef TRANSA
    B[i] *= a[k];
#else
    B[i] *= a[0];
#endif
#endif

#ifdef TRANSA
    length  = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
      B[i] += DOTU_K(length, a + 1, 1, B + i + 1, 1);
    }
#endif

    a += lda;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #9
0
ファイル: tpmv_L.c プロジェクト: AmEv7Fam/opentoonz
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    COPY_K(m, b, incb, buffer, 1);
  }
  
  a += (m + 1) * m / 2 - 1;

  for (i = 0; i < m; i++) {
#ifndef TRANSA
    if (i > 0) AXPYU_K(i, 0, 0, B[m - i - 1], a + 1, 1, B + m - i, 1, NULL, 0);
#endif
    
#ifndef UNIT
    B[m - i - 1] *= a[0];
#endif

#ifdef TRANSA
    if (i < m - 1) B[m - i - 1] += DOTU_K(m - i - 1, a - (m - i - 1), 1, B, 1);
#endif

#ifndef TRANSA
    a -= (i + 2);
#else
    a -= (m - i);
#endif
  }
    
  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #10
0
ファイル: ztrsv_U.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex result;
#endif
#ifndef UNIT
  FLOAT ar, ai, br, bi, ratio, den;
#endif
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  for (is = m; is > 0; is -= DTB_ENTRIES){

    min_i = MIN(is, DTB_ENTRIES);

#if (TRANSA == 2) || (TRANSA == 4)
    if (m - is > 0){
#if TRANSA == 2
      GEMV_T(m - is, min_i, 0, dm1, ZERO,
	      a + (is + (is - min_i)  * lda) * COMPSIZE, lda,
	      B +  is          * COMPSIZE, 1,
	      B + (is - min_i) * COMPSIZE, 1, gemvbuffer);
#else
      GEMV_C(m - is, min_i, 0, dm1, ZERO,
	      a + (is + (is - min_i)  * lda) * COMPSIZE, lda,
	      B +  is            * COMPSIZE, 1,
	      B + (is - min_i)  * COMPSIZE, 1, gemvbuffer);
#endif
    }
#endif

    for (i = 0; i < min_i; i++) {
      FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * COMPSIZE;
      FLOAT *BB = B + (is - i - 1) * COMPSIZE;

#if (TRANSA == 2) || (TRANSA == 4)
      if (i > 0) {
#if TRANSA == 2
	result = DOTU_K(i, AA + 2, 1, BB + 2, 1);
#else
	result = DOTC_K(i, AA + 2, 1, BB + 2, 1);
#endif

      BB[0] -= CREAL(result);
      BB[1] -= CIMAG(result);
      }
#endif

#ifndef UNIT
      ar = AA[0];
      ai = AA[1];
      
      if (fabs(ar) >= fabs(ai)){
	ratio = ai / ar;
	den = 1./(ar * ( 1 + ratio * ratio));
	
	ar =  den;
#if TRANSA < 3
	ai = -ratio * den;
#else
	ai =  ratio * den;
#endif
      } else {
	ratio = ar / ai;
	den = 1./(ai * ( 1 + ratio * ratio));
	ar =  ratio * den;
#if TRANSA < 3
	ai = -den;
#else
	ai =  den;
#endif
    }

      br = BB[0];
      bi = BB[1];
      
      BB[0] = ar*br - ai*bi;
      BB[1] = ar*bi + ai*br;
#endif

#if (TRANSA == 1) || (TRANSA == 3)
      if (i < min_i - 1) {
#if TRANSA == 1
	AXPYU_K (min_i - i - 1, 0, 0, - BB[0], -BB[1],
		 AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0);
#else
	AXPYC_K(min_i - i - 1, 0, 0, - BB[0], -BB[1],
		 AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0);
#endif
      }
#endif
    }

#if (TRANSA == 1) || (TRANSA == 3)
    if (is - min_i > 0){
#if   TRANSA == 1
      GEMV_N(is - min_i, min_i, 0, dm1, ZERO,
	      a +  (is - min_i) * lda * COMPSIZE, lda,
	      B +  (is - min_i)       * COMPSIZE, 1,
	      B,                                  1, gemvbuffer);
#else
      GEMV_R(is - min_i, min_i, 0, dm1, ZERO,
	      a +  (is - min_i) * lda * COMPSIZE, lda,
	      B +  (is - min_i)       * COMPSIZE, 1,
	      B,                                  1, gemvbuffer);
#endif
    }
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #11
0
ファイル: ztrmv_U.c プロジェクト: qiqipipioioi/OpenBLAS
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer) {

    BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
    OPENBLAS_COMPLEX_FLOAT temp;
#endif
#ifndef UNIT
    FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
    FLOAT *gemvbuffer = (FLOAT *)buffer;
    FLOAT *B = b;

    if (incb != 1) {
        B = buffer;
        gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
        COPY_K(m, b, incb, buffer, 1);
    }

    for (is =0; is < m; is += DTB_ENTRIES) {

        min_i = MIN(m - is, DTB_ENTRIES);

#if (TRANSA) == 1 || (TRANSA == 3)
        if (is > 0) {
#if   TRANSA == 1
            GEMV_N(is, min_i, 0, dp1, ZERO,
                   a + is * lda * 2, lda,
                   B + is       * 2, 1,
                   B,                1, gemvbuffer);
#else
            GEMV_R(is, min_i, 0, dp1, ZERO,
                   a + is * lda * 2, lda,
                   B + is       * 2, 1,
                   B,                1, gemvbuffer);
#endif
        }
#endif

        for (i = 0; i < min_i; i++) {
            FLOAT *AA = a + (is + (i + is) * lda) * 2;
            FLOAT *BB = B + is * 2;

#if (TRANSA == 1) || (TRANSA == 3)
#if   TRANSA == 1
            if (i > 0) AXPYU_K (i, 0, 0, BB[i * 2 + 0],  BB[i * 2 + 1],
                                    AA, 1, BB, 1, NULL, 0);
#else
            if (i > 0) AXPYC_K(i, 0, 0, BB[i * 2 + 0],  BB[i * 2 + 1],
                                   AA, 1, BB, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
            atemp1 = AA[i * 2 + 0];
            atemp2 = AA[i * 2 + 1];

            btemp1 = BB[i * 2 + 0];
            btemp2 = BB[i * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
            BB[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
            BB[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
            BB[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
            BB[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
            if (i < min_i - 1) {
#if TRANSA == 2
                temp = DOTU_K(min_i - i - 1,
                              AA + (i + 1) * 2, 1,
                              BB + (i + 1) * 2, 1);
#else
                temp = DOTC_K(min_i - i - 1,
                              AA + (i + 1) * 2, 1,
                              BB + (i + 1) * 2, 1);
#endif

                BB[i * 2 + 0] += CREAL(temp);
                BB[i * 2 + 1] += CIMAG(temp);
            }
#endif

        }

#if (TRANSA) == 2 || (TRANSA == 4)
        if (m - is > min_i) {
#if TRANSA == 2
            GEMV_T(m - is - min_i, min_i, 0, dp1, ZERO,
                   a + (is + min_i + is * lda) * 2,  lda,
                   B + (is + min_i) * 2, 1,
                   B +  is            * 2, 1, gemvbuffer);
#else
            GEMV_C(m - is - min_i, min_i, 0, dp1, ZERO,
                   a + (is + min_i + is * lda) * 2,  lda,
                   B + (is + min_i) * 2, 1,
                   B +  is            * 2, 1, gemvbuffer);
#endif
        }
#endif
    }

    if (incb != 1) {
        COPY_K(m, buffer, 1, b, incb);
    }

    return 0;
}
コード例 #12
0
ファイル: ztpmv_L.c プロジェクト: BOHICAMAN/OpenBLAS
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex temp;
#endif
#ifndef UNIT
  FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  a += (m + 1) * m - 2;

  for (i = 0; i < m; i++) {

#if (TRANSA == 1) || (TRANSA == 3)
#if TRANSA == 1
      if (i > 0) AXPYU_K (i, 0, 0,
			  B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
			  a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#else
      if (i > 0) AXPYC_K(i, 0, 0,
			  B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
			  a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
      atemp1 = a[0];
      atemp2 = a[1];

      btemp1 = B[(m - i - 1) * 2 + 0];
      btemp2 = B[(m - i - 1) * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
      B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
      B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
      B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
      B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
      if (i < m - 1) {
#if   TRANSA == 2
	temp = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#else
	temp = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#endif

	B[(m - i - 1) * 2 + 0] += CREAL(temp);
	B[(m - i - 1) * 2 + 1] += CIMAG(temp);
      }
#endif

#if (TRANSA == 1) || (TRANSA == 3)
    a -= (i + 2) * 2;
#else
    a -= (m - i) * 2;
#endif

    }


  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #13
0
ファイル: zhpmv_k.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, 
	 FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i;
  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = gemvbuffer;
  FLOAT *bufferX    = gemvbuffer;
  FLOAT temp[2];

  if (incy != 1) {
    Y = bufferY;
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    gemvbuffer = bufferX;
    COPY_K(m, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, x, incx, X, 1);
  }

  for (i = 0; i < m; i++) {

#ifndef HEMVREV
#ifndef LOWER
    if (i > 0) {
      FLOAT _Complex result = DOTC_K(i, a, 1, X, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (i > 0) {
      AXPYU_K(i, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a, 1, Y, 1, NULL, 0);
    }

    a += (i + 1) * 2;
  
#else

    if (m - i > 1) {
      FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (m - i > 1) {
      AXPYU_K(m - i - 1, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
    }

    a += (m - i - 1) * 2;

#endif
#else
#ifndef LOWER
    if (i > 0) {
      FLOAT _Complex result = DOTU_K(i, a, 1, X, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (i > 0) {
      AXPYC_K(i, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a, 1, Y, 1, NULL, 0);
    }

    a += (i + 1) * 2;
  
#else

    if (m - i > 1) {
      FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (m - i > 1) {
      AXPYC_K(m - i - 1, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
    }

    a += (m - i - 1) * 2;

#endif
#endif

  }
    
  if (incy != 1) {
    COPY_K(m, Y, 1, y, incy);
  }

  return 0;
}
コード例 #14
0
ファイル: sbmv_thread.c プロジェクト: Fulguritus/OpenBLAS
int CNAME(BLASLONG n, BLASLONG k, FLOAT  alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER + 1];
  BLASLONG range_n[MAX_CPU_NUMBER];

  BLASLONG width, i, num_cpu;
  double dnum;
  int mask = 7;

#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif
#endif

  args.n = n;
  args.k = k;
  
  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)buffer;
    
  args.lda = lda;
  args.ldb = incx;
  args.ldc = incy;

  dnum = (double)n * (double)n / (double)nthreads;
  num_cpu  = 0;
  
  if (n < 2 * k) {

#ifndef LOWER

    range_m[MAX_CPU_NUMBER] = n;
    i          = 0;
    
    while (i < n){
      
      if (nthreads - num_cpu > 1) {
	
	double di = (double)(n - i);
	if (di * di - dnum > 0) {
	  width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
	} else {
	  width = n - i;
	}
	
	if (width < 16) width = 16;
	if (width > n - i) width = n - i;
	
      } else {
	width = n - i;
      }
      
      range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
      range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i += width;
    }
    
#else
    
    range_m[0] = 0;
    i          = 0;
    
    while (i < n){
      
      if (nthreads - num_cpu > 1) {
	
	double di = (double)(n - i);
	if (di * di - dnum > 0) {
	  width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
	} else {
	  width = n - i;
	}
	
	if (width < 16) width = 16;
	if (width > n - i) width = n - i;
	
      } else {
	width = n - i;
      }
      
      range_m[num_cpu + 1] = range_m[num_cpu] + width;
      range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[num_cpu];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i += width;
    }
    
#endif
    
  } else {
    
    range_m[0] = 0;
    i          = n;
    
    while (i > 0){
      
      width  = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
      
      if (width < 4) width = 4;
      if (i < width) width = i;
      
      range_m[num_cpu + 1] = range_m[num_cpu] + width;
      
      range_n[num_cpu] = num_cpu * ((n + 15) & ~15);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[num_cpu];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i -= width;
    }
  }

  if (num_cpu) {
    queue[0].sa = NULL;
    queue[0].sb = buffer;
    queue[num_cpu - 1].next = NULL;
  
    exec_blas(num_cpu, queue);
  }
  
  for (i = 1; i < num_cpu; i ++) {
    AXPYU_K(n, 0, 0,
#ifndef COMPLEX
	    ONE,
#else
	    ONE, ZERO,
#endif
	    (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
  }

  AXPYU_K(n, 0, 0,
#ifndef COMPLEX
	  alpha,
#else
	  alpha[0], alpha[1],
#endif
	  buffer, 1, y, incy, NULL, 0);
  
  return 0;
}
コード例 #15
0
ファイル: zaxpy.c プロジェクト: 34985086/meshlab
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
  
  blasint n    = *N;
  blasint incx = *INCX;
  blasint incy = *INCY;

#else

void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy){

#endif

  FLOAT alpha_r = *(ALPHA + 0);
  FLOAT alpha_i = *(ALPHA + 1);

#ifdef SMP
  int mode, nthreads;
#endif

#ifndef CBLAS
  PRINT_DEBUG_CNAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (n <= 0) return;

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  if (incx < 0) x -= (n - 1) * incx * 2;
  if (incy < 0) y -= (n - 1) * incy * 2;

#ifdef SMP
  nthreads = num_cpu_avail(1);

  if (nthreads == 1) {
#endif

#ifndef CONJ
    AXPYU_K (n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0);
#else
    AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0);
#endif
    
#ifdef SMP
  } else {
    
#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
    mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
    
    blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0,
#ifndef CONJ
		       (void *)AXPYU_K,
#else
		       (void *)AXPYC_K,
#endif
		       nthreads);
  }
#endif
  
  FUNCTION_PROFILE_END(4, 2 * n, 2 * n);

  IDEBUG_END;

  return;
  
}
コード例 #16
0
ファイル: tpmv_thread.c プロジェクト: 4ker/OpenBLAS
int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER + 1];
  BLASLONG range_n[MAX_CPU_NUMBER];

  BLASLONG width, i, num_cpu;

  double dnum;
  int mask = 7;

#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif
#endif

  args.m = m;

  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)(buffer);

  args.ldb = incx;
  args.ldc = incx;

  dnum = (double)m * (double)m / (double)nthreads;
  num_cpu  = 0;

#ifndef LOWER

  range_m[MAX_CPU_NUMBER] = m;
  i          = 0;

  while (i < m){

    if (nthreads - num_cpu > 1) {

      double di = (double)(m - i);
      if (di * di - dnum > 0) {
	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
	width = m - i;
      }

      if (width < 16) width = 16;
      if (width > m - i) width = m - i;

    } else {
      width = m - i;
    }

    range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
    range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = tpmv_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#else

  range_m[0] = 0;
  i          = 0;

  while (i < m){

    if (nthreads - num_cpu > 1) {

      double di = (double)(m - i);
      if (di * di - dnum > 0) {
	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
	width = m - i;
      }

      if (width < 16) width = 16;
      if (width > m - i) width = m - i;

    } else {
      width = m - i;
    }

    range_m[num_cpu + 1] = range_m[num_cpu] + width;
    range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = tpmv_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[num_cpu];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#endif

  if (num_cpu) {
    queue[0].sa = NULL;
    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;

    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

#ifndef TRANS
  for (i = 1; i < num_cpu; i ++) {

#ifndef LOWER

    AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE,
#ifdef COMPLEX
	    ZERO,
#endif
	    buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);

#else

    AXPYU_K(m - range_m[i], 0, 0, ONE,
#ifdef COMPLEX
	    ZERO,
#endif
	    buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);

#endif

  }
#endif

  COPY_K(m, buffer, 1, x, incx);

  return 0;
}
コード例 #17
0
ファイル: zsbmv_k.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
	  FLOAT *a, BLASLONG lda,
	  FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i, length;
#ifndef LOWER
  BLASLONG offset;
#endif

  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *sbmvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = sbmvbuffer;
  FLOAT *bufferX    = sbmvbuffer;

  if (incy != 1) {
    Y = bufferY;
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    sbmvbuffer = bufferX;
    COPY_K(n, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, x, incx, X, 1);
  }

#ifndef LOWER
  offset = k;
#endif

  for (i = 0; i < n; i++) {

#ifndef LOWER
    length  = k - offset;

    AXPYU_K(length + 1, 0, 0,
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	    alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	    a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0);

    if (length > 0) {
      FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1);

      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
      }

    if (offset > 0) offset --;
#else
    length  = k;
    if (n - i - 1 < k) length = n - i - 1;

    AXPYU_K(length + 1, 0, 0,
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	    alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	    a, 1, Y + i * COMPSIZE, 1, NULL, 0);

    if (length > 0) {
      FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
      }
#endif
    
    a += lda * 2;
  }

  if (incy != 1) {
    COPY_K(n, Y, 1, y, incy);
  }

  return 0;
}
コード例 #18
0
ファイル: gbmv_thread.c プロジェクト: 4ker/OpenBLAS
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT  alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER];
  BLASLONG range_n[MAX_CPU_NUMBER + 1];

  BLASLONG width, i, num_cpu;

#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif
#endif

  args.m = m;
  args.n = n;

  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)buffer;

  args.lda = lda;
  args.ldb = incx;
  args.ldc = ku;
  args.ldd = kl;

  num_cpu  = 0;

  range_n[0] = 0;
  i          = n;

  while (i > 0){

    width  = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);

    if (width < 4) width = 4;
    if (i < width) width = i;

    range_n[num_cpu + 1] = range_n[num_cpu] + width;

#ifndef TRANSA
    range_m[num_cpu] = num_cpu * ((m + 15) & ~15);
#else
    range_m[num_cpu] = num_cpu * ((n + 15) & ~15);
#endif

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = gbmv_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[num_cpu];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i -= width;
  }

  if (num_cpu) {
    queue[0].sa = NULL;
#ifndef TRANSA
    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
#else
    queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
#endif

    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

  for (i = 1; i < num_cpu; i ++) {
    AXPYU_K(
#ifndef TRANSA
	    m,
#else
	    n,
#endif
	    0, 0,
#ifndef COMPLEX
	    ONE,
#else
	    ONE, ZERO,
#endif
	    buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
  }

  AXPYU_K(
#ifndef TRANSA
	    m,
#else
	    n,
#endif
	    0, 0,
#ifndef COMPLEX
	    alpha,
#else
	    alpha[0], alpha[1],
#endif
	    buffer, 1, y, incy, NULL, 0);

  return 0;
}
コード例 #19
0
ファイル: zher2_k.c プロジェクト: 34985086/meshlab
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
	 FLOAT *x, BLASLONG incx, 
	 FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){

  BLASLONG i;
  FLOAT *X, *Y;

  X = x;
  Y = y;

  lda *= 2;

  if (incx != 1) {
    COPY_K(m, x, incx, buffer, 1);
    X = buffer;
  }

  if (incy != 1) {
    COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1);
    Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2));
  }

  for (i = 0; i < m; i++){
#ifndef HEMVREV
#ifndef LOWER
    AXPYU_K(i + 1, 0, 0, 
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	  - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1],
	    Y,     1, a, 1, NULL, 0);
    AXPYU_K(i + 1, 0, 0,
	    alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1],
	    alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1],
	    X,     1, a, 1, NULL, 0);
    a[i * 2 + 1] = ZERO;
    a += lda;
#else
    AXPYU_K(m - i, 0, 0,
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	  - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1],
	    Y + i * 2, 1, a, 1, NULL, 0);
    AXPYU_K(m - i, 0, 0,
	    alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1],
	    alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1],
	    X + i * 2, 1, a, 1, NULL, 0);
    a[1] = ZERO;
    a += 2 + lda;
#endif
#else
#ifndef LOWER
    AXPYC_K(i + 1, 0, 0, 
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	    alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
	    Y,     1, a, 1, NULL, 0);
    AXPYC_K(i + 1, 0, 0,
	    alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1],
	  - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1],
	    X,     1, a, 1, NULL, 0);
    a[i * 2 + 1] = ZERO;
    a += lda;
#else
    AXPYC_K(m - i, 0, 0,
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	    alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
	    Y + i * 2, 1, a, 1, NULL, 0);
    AXPYC_K(m - i, 0, 0,
	    alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1],
	  - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1],
	    X + i * 2, 1, a, 1, NULL, 0);
    a[1] = ZERO;
    a += 2 + lda;
#endif
#endif

    }


  return 0;
}
コード例 #20
0
ファイル: axpy.c プロジェクト: 4ker/OpenBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){

  BLASLONG n    = *N;
  BLASLONG incx = *INCX;
  BLASLONG incy = *INCY;
  FLOAT alpha = *ALPHA;

#else

void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){

#endif

#ifdef SMP
  int mode, nthreads;
#endif

#ifndef CBLAS
  PRINT_DEBUG_NAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (n <= 0) return;

  if (alpha == ZERO) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  if (incx < 0) x -= (n - 1) * incx;
  if (incy < 0) y -= (n - 1) * incy;

#ifdef SMP
  nthreads = num_cpu_avail(1);

  //disable multi-thread when incx==0 or incy==0
  //In that case, the threads would be dependent.
  if (incx == 0 || incy == 0)
	  nthreads = 1;

  //Temporarily work-around the low performance issue with small imput size &
  //multithreads.
  if (n <= 10000)
	  nthreads = 1;

  if (nthreads == 1) {
#endif

  AXPYU_K(n, 0, 0, alpha, x, incx, y, incy, NULL, 0);

#ifdef SMP
  } else {

#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
    mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif

    blas_level1_thread(mode, n, 0, 0, &alpha,
		       x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads);

  }
#endif

  FUNCTION_PROFILE_END(1, 2 * n, 2 * n);

  IDEBUG_END;

  return;

}
コード例 #21
0
ファイル: ztbmv_U.c プロジェクト: QuincyPYoung/OpenBLAS
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;
  BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT temp;
#endif
#ifndef UNIT
  FLOAT atemp1, atemp2, btemp1, btemp2;
#endif

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, b, incb, buffer, 1);
  }

  for (i = 0; i < n; i++) {

#if (TRANSA == 1) || (TRANSA == 3)
    length  = i;
    if (length > k) length = k;

    if (length > 0) {
#if   TRANSA == 1
      AXPYU_K(length, 0, 0,
	      B[i * 2 + 0],  B[i * 2 + 1],
	      a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0);
#else
      AXPYC_K(length, 0, 0,
	      B[i * 2 + 0],  B[i * 2 + 1],
	      a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0);
#endif

    }
#endif

#ifndef UNIT
#if (TRANSA == 1) || (TRANSA == 3)
      atemp1 = a[k * 2 + 0];
      atemp2 = a[k * 2 + 1];
#else
      atemp1 = a[0];
      atemp2 = a[1];
#endif

      btemp1 = B[i * 2 + 0];
      btemp2 = B[i * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
      B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
      B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
      B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
      B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
    length  = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
#if TRANSA == 2
      temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1);
#else
      temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1);
#endif

      B[i * 2 + 0] += CREAL(temp);
      B[i * 2 + 1] += CIMAG(temp);
    }
#endif

    a += lda * COMPSIZE;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
コード例 #22
0
ファイル: spr2_thread.c プロジェクト: BOHICAMAN/OpenBLAS
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx, incy;
  BLASLONG i, m_from, m_to;
  FLOAT alpha_r;
#ifdef COMPLEX
  FLOAT alpha_i;
#endif

  x = (FLOAT *)args -> a;
  y = (FLOAT *)args -> b;
  a = (FLOAT *)args -> c;

  incx = args -> lda;
  incy = args -> ldb;
  lda  = args -> ldc;

  alpha_r = *((FLOAT *)args -> alpha + 0);
#ifdef COMPLEX
  alpha_i = *((FLOAT *)args -> alpha + 1);
#endif

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
    x = buffer;
    buffer += ((COMPSIZE * args -> m  + 1023) & ~1023);
  }

  if (incy != 1) {
#ifndef LOWER
    COPY_K(m_to, y, incy, buffer, 1);
#else
    COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1);
#endif
    y = buffer;
  }

#ifndef LOWER
  a += (m_from + 1) * m_from / 2  * COMPSIZE;
#else
  a += (2 * args -> m - m_from + 1) * m_from / 2  * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++){
#if !defined(HEMV) && !defined(HEMVREV)
#ifndef COMPLEX
    if (x[i] != ZERO) {
#ifndef LOWER
      AXPYU_K(i + 1,         0, 0, alpha_r * x[i], y,     1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a, 1, NULL, 0);
#endif
    }
    if (y[i] != ZERO) {
#ifndef LOWER
      AXPYU_K(i + 1,         0, 0, alpha_r * y[i], x,     1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a, 1, NULL, 0);
#endif
    }
#else
    if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
      AXPYU_K(i + 1,         0, 0,
	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
	      alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
	      y,                1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
	      alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
	      y + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
    }
    if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef LOWER
      AXPYU_K(i + 1,         0, 0,
	      alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1],
	      alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
	      x,                1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
	      alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1],
	      alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
	      x + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
    }
#endif
#else
    if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
#ifndef LOWER
      AXPYU_K(i + 1,         0, 0,
	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
	    - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1],
	      y,                1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
	    - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1],
	      y + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#else
#ifndef LOWER
      AXPYC_K(i + 1,         0, 0,
	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
	      alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
	      y,                1, a, 1, NULL, 0);
#else
      AXPYC_K(args -> m - i, 0, 0,
	      alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1],
	      alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1],
	      y + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#endif
    }
    if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) {
#ifndef HEMVREV
#ifndef LOWER
      AXPYU_K(i + 1,         0, 0,
	      alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
	      alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1],
	      x,                1, a, 1, NULL, 0);
#else
      AXPYU_K(args -> m - i, 0, 0,
	      alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
	      alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1],
	      x + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#else
#ifndef LOWER
      AXPYC_K(i + 1,         0, 0,
	      alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
	    - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
	      x,                1, a, 1, NULL, 0);
#else
      AXPYC_K(args -> m - i, 0, 0,
	      alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1],
	    - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1],
	      x + i * COMPSIZE, 1, a, 1, NULL, 0);
#endif
#endif
    }
#ifndef LOWER
    a[i * COMPSIZE + 1] = ZERO;
#else
    a[               1] = ZERO;
#endif
#endif

#ifndef LOWER
    a += (i + 1) * COMPSIZE;
#else
    a += (args -> m - i) * COMPSIZE;
#endif
  }

  return 0;
}
コード例 #23
0
ファイル: ztbsv_L.c プロジェクト: 4ker/OpenBLAS
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  FLOAT *B = b;
  BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT temp;
#endif
#ifndef UNIT
  FLOAT ar, ai, br, bi, ratio, den;
#endif

  if (incb != 1) {
    B = buffer;
    COPY_K(n, b, incb, buffer, 1);
  }

  for (i = 0; i < n; i++) {

#if (TRANSA == 2) || (TRANSA == 4)
    length  = i;
    if (length > k) length = k;

    if (length > 0) {
#if TRANSA == 2
      temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1);
#else
      temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1);
#endif

      B[i * 2 + 0] -= CREAL(temp);
      B[i * 2 + 1] -= CIMAG(temp);
    }
#endif

#ifndef UNIT
#if (TRANSA == 1) || (TRANSA == 3)
    ar = a[0];
    ai = a[1];
#else
    ar = a[k * 2 + 0];
    ai = a[k * 2 + 1];
#endif

    if (fabs(ar) >= fabs(ai)){
      ratio = ai / ar;
      den = 1./(ar * ( 1 + ratio * ratio));

      ar =  den;
#if TRANSA < 3
      ai = -ratio * den;
#else
      ai =  ratio * den;
#endif
    } else {
      ratio = ar / ai;
      den = 1./(ai * ( 1 + ratio * ratio));
      ar =  ratio * den;
#if TRANSA < 3
      ai = -den;
#else
      ai =  den;
#endif
    }

    br = B[i * 2 + 0];
    bi = B[i * 2 + 1];

    B[i * 2 + 0] = ar*br - ai*bi;
    B[i * 2 + 1] = ar*bi + ai*br;
#endif

#if (TRANSA == 1) || (TRANSA == 3)
    length  = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
#if   TRANSA == 1
      AXPYU_K(length, 0, 0,
	      -B[i * 2 + 0],  -B[i * 2 + 1],
	      a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0);
#else
      AXPYC_K(length, 0, 0,
	      -B[i * 2 + 0],  -B[i * 2 + 1],
	      a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0);
#endif
    }
#endif

    a += lda * COMPSIZE;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}