コード例 #1
0
ファイル: getrf_parallel.c プロジェクト: 34985086/meshlab
/*
 * Parallel blocked factorization driver (file getrf_parallel.c; presumably
 * LU with partial pivoting, judging by the GETF2 / GETRF_SINGLE / LASWP_PLUS
 * helpers and the ipiv array -- confirm against the kernel definitions).
 *
 * The matrix is walked in diagonal panels of width bk.  For each panel the
 * trailing columns are split into slices: slice 0 is processed by the caller
 * through inner_basic_thread(), the other slices are queued with
 * exec_blas_async().  While the queued slices are outstanding the caller
 * also factors the next panel with GETRF_SINGLE.
 *
 * args    : reads m, n, a, lda, c (used as the ipiv array) and nthreads.
 * range_m : not used by this function.
 * range_n : optional; when non-NULL the work is limited to the sub-problem
 *           that starts at diagonal position range_n[0] and is
 *           range_n[1] - range_n[0] columns wide.
 * sa, sb  : work buffers handed on to the kernels.
 * myid    : not used by this function.
 *
 * Returns 0 for an empty problem, the GETF2 result for small problems, and
 * otherwise the first non-zero status reported for any panel (later panels
 * are shifted by their starting position `is`).
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;
  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range[MAX_CPU_NUMBER + 1];

  BLASLONG width, nn, num_cpu;

  /* One completion flag per queue entry, spaced CACHE_LINE_SIZE elements apart. */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

  /* Build the precision / real-or-complex mode word from the compile-time macros. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict to the requested sub-problem: drop range_n[0] rows and move `a`
     down the diagonal by range_n[0] (lda + 1 elements per step). */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common = NULL;
  newarg.nthreads = args -> nthreads;

  mn = MIN(m, n);

  /* Initial panel width: mn / 2 rounded up to a multiple of GEMM_UNROLL_N
     (the mask form assumes GEMM_UNROLL_N is a power of two), capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Panel too narrow to be worth blocking: hand the whole problem to GETF2. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  width = FORMULA1(m, n, 0, init_bk, args -> nthreads);
  width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (width > n - init_bk) width = n - init_bk;

  /* If the caller's slice would be narrower than the panel, shrink the panel
     to the FORMULA2 value (never below GEMM_UNROLL_N, never growing it). */
  if (width < init_bk) {
    long temp;

    temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
    temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N;
    if (temp < init_bk) init_bk = temp;

  }

  next_bk = init_bk;
  bk      = init_bk;

  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;
  
  /* Factor the first panel by recursing on columns [offset, offset + bk). */
  info   = CNAME(args, NULL, range_n_new, sa, sb, 0);
  
  /* Copy the bk x bk diagonal block into sb (presumably in packed form for
     the triangular-solve kernel -- confirm in TRSM_ILTCOPY). */
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  is = 0;
  num_cpu = 0;

  /* Second work buffer: placed GEMM_PQ * GEMM_PQ * COMPSIZE elements past sb,
     rounded with the GEMM_ALIGN mask and shifted by GEMM_OFFSET_B. */
  sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);

  /* Main loop: one iteration per diagonal panel starting at position `is`. */
  while (is < mn) {

    /* Width of the slice the calling thread keeps for itself. */
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    
    if (width < bk) {

      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif

      width = next_bk;
    }
    
    /* Never reach past the end of the min(m, n) diagonal. */
    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    nn = n - bk - is;
    if (width > nn) width = nn;

    /* Wait for the entries queued in the previous iteration before the
       queue, range and newarg storage is reused. */
    if (num_cpu > 1)  exec_blas_async_wait(num_cpu - 1, &queue[1]);

    /* Slice 0 (columns [0, width) of the trailing part) stays with the caller. */
    range[0] = 0;
    range[1] = width;
    
    num_cpu = 1;
    nn -= width;
    
    /* Shared argument block for every slice of this panel. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = m - bk - is;
    newarg.n   = n - bk - is;
    newarg.k   = bk;
    newarg.ldb = is + offset;
    
    /* Distribute the remaining nn columns over queue entries 1, 2, ...
       NOTE(review): the divisor is zero when args->nthreads == 1 and nn > 0;
       presumably callers only take this path with more than one thread -- confirm. */
    while (nn > 0){
      
      width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu);
      
      nn -= width;
      if (nn < 0) width = width + nn;
      
      range[num_cpu + 1] = range[num_cpu] + width;
      
      queue[num_cpu].mode    = mode;
      //queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].routine = (void *)inner_basic_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = NULL;
      queue[num_cpu].range_n = &range[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      /* Mark the slice as pending; the loop further down spins until it reads 0
         (presumably cleared by inner_basic_thread -- not visible here). */
      flag[num_cpu * CACHE_LINE_SIZE] = 1;

      num_cpu ++;
    }
    
    queue[num_cpu - 1].next = NULL;

    /* Advance to the next panel and clamp its width to what is left. */
    is += bk;
    
    bk = n - is;
    if (bk > next_bk) bk = next_bk;
    
    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;
    
    if (num_cpu > 1) {

      /* Launch the queued slices, then do the caller's share. */
      exec_blas_async(1, &queue[1]);
    
#if 0
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0);

      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
#else

      if (range[1] >= bk * 4) {

	/* The caller's slice is at least four panels wide: update only the
	   columns of the next panel, factor that panel, then finish the rest. */
	BLASLONG myrange[2];

	myrange[0] = 0;
	myrange[1] = bk;

	inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

	iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

	myrange[0] = bk;
	myrange[1] = range[1];

	inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

      } else {

	inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);

	iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      }

#endif

      /* Busy-wait until every queued slice has reset its flag. */
      for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
      
      /* Refill sb with the diagonal block of the panel just factored. */
      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);
      
    } else {

      /* Nothing was queued: the caller does the whole update itself. */
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);
      
      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
    }

      /* Keep only the first failure, expressed relative to this call's origin. */
      if (iinfo && !info) info = iinfo + is;
      
  }
  
  /* Second pass: recompute the panel widths with the same formulas and call
     LASWP_PLUS through blas_level1_thread once per panel (presumably applying
     the later row interchanges to the columns of that panel -- confirm). */
  next_bk = init_bk;
  bk      = init_bk;
  
  is = 0;
  
  while (is < mn) {
    
    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (width < bk) {
      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif
    }

    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    /* dummyalpha is a placeholder scalar argument for the generic level-1 threading call. */
    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, 
		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
    
    is += bk;
  }
  
  return info;
}
コード例 #2
0
ファイル: axpy.c プロジェクト: 4ker/OpenBLAS
/*
 * Real AXPY interface (y := alpha * x + y per the BLAS definition; the
 * arithmetic itself is done by AXPYU_K).
 *
 * Two entry points share one body:
 *   - NAME  : Fortran-style interface, every argument passed by pointer.
 *   - CNAME : CBLAS-style interface, scalars passed by value.
 *
 * n          : number of elements; nothing is done when n <= 0.
 * alpha      : scale factor; nothing is done when alpha == ZERO.
 * x, incx    : source vector and its stride.
 * y, incy    : destination vector and its stride.
 *
 * The opening conditional must mirror the `#ifndef CBLAS` used for the debug
 * print below; without it the `#else` / `#endif` pair is unbalanced.
 */
#ifndef CBLAS

void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){

  BLASLONG n    = *N;
  BLASLONG incx = *INCX;
  BLASLONG incy = *INCY;
  FLOAT alpha = *ALPHA;

#else

void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){

#endif

#ifdef SMP
  int mode, nthreads;
#endif

#ifndef CBLAS
  PRINT_DEBUG_NAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (n <= 0) return;

  if (alpha == ZERO) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* For a negative stride, move the base pointer to the far end of the
     vector so that element i is still reached at base + i * inc. */
  if (incx < 0) x -= (n - 1) * incx;
  if (incy < 0) y -= (n - 1) * incy;

#ifdef SMP
  nthreads = num_cpu_avail(1);

  /* Disable multi-threading when incx == 0 or incy == 0: in that case the
     per-thread chunks would alias the same element and depend on each other. */
  if (incx == 0 || incy == 0)
	  nthreads = 1;

  /* Temporary work-around for the low performance seen with small input
     sizes and multiple threads. */
  if (n <= 10000)
	  nthreads = 1;

  if (nthreads == 1) {
#endif

  AXPYU_K(n, 0, 0, alpha, x, incx, y, incy, NULL, 0);

#ifdef SMP
  } else {

#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
    mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif

    /* Split the n elements across nthreads, each running AXPYU_K on its chunk. */
    blas_level1_thread(mode, n, 0, 0, &alpha,
		       x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads);

  }
#endif

  FUNCTION_PROFILE_END(1, 2 * n, 2 * n);

  IDEBUG_END;

  return;

}
コード例 #3
0
ファイル: getrf_parallel.c プロジェクト: 34985086/meshlab
/*
 * Parallel blocked factorization driver, variant that schedules
 * inner_advanced_thread (file getrf_parallel.c; presumably LU with partial
 * pivoting, judging by the GETF2 / GETRF_SINGLE / LASWP_PLUS helpers and the
 * ipiv array -- confirm against the kernel definitions).
 *
 * The matrix is walked in diagonal panels of width bk.  For each panel the
 * caller keeps the first `width` trailing columns for itself
 * (inner_basic_thread) and factors the next panel with GETRF_SINGLE, while
 * the rest of the trailing part is divided in both rows (range_M) and
 * columns (range_N) among queue entries launched with exec_blas_async().
 *
 * args    : reads m, n, a, lda, c (used as the ipiv array) and nthreads.
 * range_m : not used by this function.
 * range_n : optional; when non-NULL the work is limited to the sub-problem
 *           that starts at diagonal position range_n[0] and is
 *           range_n[1] - range_n[0] columns wide.
 * sa, sb  : work buffers handed on to the kernels.
 * myid    : not used by this function.
 *
 * Returns 0 for an empty problem, the GETF2 result for small problems, and
 * otherwise the first non-zero status reported for any panel (later panels
 * are shifted by their starting position `is`).
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;

  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];

  /* Row and column partition boundaries for the queued entries. */
  BLASLONG range_M[MAX_CPU_NUMBER + 1];
  BLASLONG range_N[MAX_CPU_NUMBER + 1];

  /* Per-entry job state, published to the workers through newarg.common. */
  job_t        job[MAX_CPU_NUMBER];

  BLASLONG width, nn, mm;
  BLASLONG i, j, k, is, bk;

  BLASLONG num_cpu;

  /* One completion flag per queue entry, spaced CACHE_LINE_SIZE elements apart. */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

  /* Build the precision / real-or-complex mode word from the compile-time macros. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict to the requested sub-problem: drop range_n[0] rows and move `a`
     down the diagonal by range_n[0] (lda + 1 elements per step). */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common   = (void *)job;

  info = 0;

  mn = MIN(m, n);

  /* Initial panel width: mn / 2 rounded up to a multiple of GEMM_UNROLL_N
     (the mask form assumes GEMM_UNROLL_N is a power of two), capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Panel too narrow to be worth blocking: hand the whole problem to GETF2. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  next_bk = init_bk;

  bk = mn;
  if (bk > next_bk) bk = next_bk;
  
  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;
  
  /* Factor the first panel by recursing on columns [offset, offset + bk). */
  iinfo   = CNAME(args, NULL, range_n_new, sa, sb, 0);
  
  if (iinfo && !info) info = iinfo;
  
  /* Copy the bk x bk diagonal block into sb (presumably in packed form for
     the triangular-solve kernel -- confirm in TRSM_ILTCOPY). */
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  /* Second work buffer: placed right after the bk x bk block in sb, rounded
     with the GEMM_ALIGN mask and shifted by GEMM_OFFSET_B. */
  sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  
  is = 0;
  num_cpu = 0;

  /* Main loop: one iteration per diagonal panel starting at position `is`. */
  while (is < mn) {
    
    /* Width of the column slice the calling thread keeps for itself. */
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      /* Note: this rounding adds GEMM_UNROLL_N, not GEMM_UNROLL_N - 1 as above. */
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      
      if (next_bk > bk) next_bk = bk;

      width = next_bk;
      if (width > mn - is - bk) width = mn - is - bk;
    }
    
    /* Wait for the entries queued in the previous iteration before the
       queue, range and newarg storage is reused. */
    if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);

    mm = m - bk - is;
    nn = n - bk - is;

    /* Shared argument block for every participant of this panel. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = mm;
    newarg.n   = nn;
    newarg.k   = bk;
    newarg.ldb = is + offset;
    
    nn -= width;

    /* Columns [0, width) stay with the caller; queued work starts at `width`. */
    range_n_mine[0] = 0;
    range_n_mine[1] = width;

    range_N[0] = width;
    range_M[0] = 0;

    num_cpu  = 0;
    
    /* Cut both the remaining rows (mm) and columns (nn) into one piece per
       queue entry; the larger dimension is divided first.
       NOTE(review): the divisor nthreads - num_cpu - 1 is zero when
       args->nthreads == 1 and nn > 0; presumably callers only take this path
       with more than one thread -- confirm. */
    while (nn > 0){
      
      if (mm >= nn) {

	width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (nn < width) width = nn;
	nn -= width;
	range_N[num_cpu + 1] = range_N[num_cpu] + width;
	
	width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (mm < width) width = mm;
	/* Last column piece: give this entry all remaining rows as well. */
	if (nn <=    0) width = mm;
	mm -= width;
	range_M[num_cpu + 1] = range_M[num_cpu] + width;

      } else {

	width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (mm < width) width = mm;
	mm -= width;
	range_M[num_cpu + 1] = range_M[num_cpu] + width;

	width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (nn < width) width = nn;
	/* Rows exhausted: give this entry all remaining columns as well. */
	if (mm <=    0) width = nn;
	nn -= width;
	range_N[num_cpu + 1] = range_N[num_cpu] + width;
	
      }

      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].args    = &newarg;
      /* Each entry gets its own row range but a pointer to the whole
         column-boundary table. */
      queue[num_cpu].range_m = &range_M[num_cpu];
      queue[num_cpu].range_n = &range_N[0];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      /* Mark the entry as pending; the loop further down spins until it reads 0
         (presumably cleared by inner_advanced_thread -- not visible here). */
      flag[num_cpu * CACHE_LINE_SIZE] = 1;
      
      num_cpu ++;

    }
    
    newarg.nthreads = num_cpu;
    
    /* Reset every working[][] slot of the job table for the entries in use. */
    if (num_cpu > 0) {
      for (j = 0; j < num_cpu; j++) {
	for (i = 0; i < num_cpu; i++) {
	  for (k = 0; k < DIVIDE_RATE; k++) {
	    job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	  }
	}
      }
    }

    /* Advance to the next panel and clamp its width to what is left. */
    is += bk;

    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;

    if (num_cpu > 0) {

      queue[num_cpu - 1].next = NULL;
      
      /* Launch the queued entries, then do the caller's share. */
      exec_blas_async(0, &queue[0]);
      
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
      
      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      
      /* Keep only the first failure, expressed relative to this call's origin. */
      if (iinfo && !info) info = iinfo + is;

      /* Busy-wait until every queued entry has reset its flag. */
      for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};

      /* Refill sb with the diagonal block of the panel just factored. */
      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);

    } else {

      /* Nothing was queued: the caller does the whole update itself. */
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);

      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

      if (iinfo && !info) info = iinfo + is;
    
    }
    
  }
  
  /* Second pass: recompute the panel widths with the same formulas and call
     LASWP_PLUS through blas_level1_thread once per panel (presumably applying
     the later row interchanges to the columns of that panel -- confirm). */
  next_bk = init_bk;
  is = 0;
  
  while (is < mn) {
    
    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      if (next_bk > bk) next_bk = bk;
    }

    /* dummyalpha is a placeholder scalar argument for the generic level-1 threading call. */
    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, 
		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
    
    is += bk;
  }
  
  return info;
}
コード例 #4
0
ファイル: zswap.c プロジェクト: dagss/gotoblas2
/*
 * Complex SWAP interface (exchanges the vectors x and y per the BLAS
 * definition; the element work is done by SWAP_K).
 *
 * Two entry points share one body:
 *   - NAME  : Fortran-style interface, every argument passed by pointer.
 *   - CNAME : CBLAS-style interface, scalars passed by value.
 *
 * n        : number of complex elements; nothing is done when n <= 0.
 * x, incx  : first vector and its stride (in complex elements).
 * y, incy  : second vector and its stride (in complex elements).
 *
 * The opening conditional must mirror the `#ifndef CBLAS` used for the debug
 * print below; without it the `#else` / `#endif` pair is unbalanced.
 */
#ifndef CBLAS

void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
  
  blasint n    = *N;
  blasint incx = *INCX;
  blasint incy = *INCY;

#else

void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){

#endif

#ifdef SMP
  int mode;
  FLOAT dummyalpha[2] = {ZERO, ZERO};
  int nthreads;
#endif

#ifndef CBLAS
  PRINT_DEBUG_NAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (n <= 0) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* For a negative stride, move the base pointer to the far end of the
     vector; the factor 2 accounts for the two FLOATs of each complex element. */
  if (incx < 0) x -= (n - 1) * incx * 2;
  if (incy < 0) y -= (n - 1) * incy * 2;

#ifdef SMP
  nthreads = num_cpu_avail(1);

  /* Disable multi-threading when incx == 0 or incy == 0: every chunk would
     then touch the same element, so the threads would depend on each other
     (same guard as the real-valued swap interface). */
  if (incx == 0 || incy == 0)
	  nthreads = 1;

  if (nthreads == 1) {
#endif

  SWAP_K(n, 0, 0, ZERO, ZERO, x, incx, y, incy, NULL, 0);

#ifdef SMP
  } else {

#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
    mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
    
    /* dummyalpha only fills the scalar slot of the generic threading helper. */
    blas_level1_thread(mode, n, 0, 0, dummyalpha,
		       x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads);

  }
#endif

  FUNCTION_PROFILE_END(2, 2 * n, 0);

  IDEBUG_END;

  return;
  
}
コード例 #5
0
ファイル: zaxpy.c プロジェクト: 34985086/meshlab
/*
 * Complex AXPY interface (y := alpha * x + y per the BLAS definition; with
 * CONJ defined the AXPYC_K kernel is used instead of AXPYU_K).
 *
 * Two entry points share one body:
 *   - NAME  : Fortran-style interface, every argument passed by pointer.
 *   - CNAME : CBLAS-style interface, n and the strides passed by value.
 *
 * n        : number of complex elements; nothing is done when n <= 0.
 * ALPHA    : pointer to the complex scale factor (real part, then imaginary
 *            part); nothing is done when both parts are ZERO.
 * x, incx  : source vector and its stride (in complex elements).
 * y, incy  : destination vector and its stride (in complex elements).
 *
 * The opening conditional must mirror the `#ifndef CBLAS` used for the debug
 * print below; without it the `#else` / `#endif` pair is unbalanced.
 */
#ifndef CBLAS

void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
  
  blasint n    = *N;
  blasint incx = *INCX;
  blasint incy = *INCY;

#else

void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy){

#endif

  FLOAT alpha_r = *(ALPHA + 0);
  FLOAT alpha_i = *(ALPHA + 1);

#ifdef SMP
  int mode, nthreads;
#endif

  /* Report the name of the entry point that was actually compiled. */
#ifndef CBLAS
  PRINT_DEBUG_NAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (n <= 0) return;

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* For a negative stride, move the base pointer to the far end of the
     vector; the factor 2 accounts for the two FLOATs of each complex element. */
  if (incx < 0) x -= (n - 1) * incx * 2;
  if (incy < 0) y -= (n - 1) * incy * 2;

#ifdef SMP
  nthreads = num_cpu_avail(1);

  /* Disable multi-threading when incx == 0 or incy == 0: every chunk would
     then touch the same element, so the threads would depend on each other
     (same guard as the real-valued axpy interface). */
  if (incx == 0 || incy == 0)
	  nthreads = 1;

  if (nthreads == 1) {
#endif

#ifndef CONJ
    AXPYU_K (n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0);
#else
    AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0);
#endif
    
#ifdef SMP
  } else {
    
#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
    mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
    
    /* Split the n elements across nthreads, each running the same kernel
       that the single-threaded path uses. */
    blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0,
#ifndef CONJ
		       (void *)AXPYU_K,
#else
		       (void *)AXPYC_K,
#endif
		       nthreads);
  }
#endif
  
  FUNCTION_PROFILE_END(4, 2 * n, 2 * n);

  IDEBUG_END;

  return;
  
}
コード例 #6
0
ファイル: swap.c プロジェクト: AmEv7Fam/opentoonz
/*
 * Real SWAP interface (exchanges the vectors x and y per the BLAS
 * definition; the element work is done by SWAP_K).
 *
 * Two entry points share one body:
 *   - NAME  : Fortran-style interface, every argument passed by pointer.
 *   - CNAME : CBLAS-style interface, scalars passed by value.
 *
 * n        : number of elements; nothing is done when n <= 0.
 * x, incx  : first vector and its stride.
 * y, incy  : second vector and its stride.
 *
 * The opening conditional must mirror the `#ifndef CBLAS` used for the debug
 * print below; without it the `#else` / `#endif` pair is unbalanced.
 */
#ifndef CBLAS

void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
  
  blasint n    = *N;
  blasint incx = *INCX;
  blasint incy = *INCY;

#else

void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){

#endif

#ifdef SMP
  int mode, nthreads;
  FLOAT dummyalpha[2] = {ZERO, ZERO};
#endif

#ifndef CBLAS
  PRINT_DEBUG_NAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (n <= 0) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* For a negative stride, move the base pointer to the far end of the
     vector so that element i is still reached at base + i * inc. */
  if (incx < 0) x -= (n - 1) * incx;
  if (incy < 0) y -= (n - 1) * incy;

#ifdef SMP
  nthreads = num_cpu_avail(1);
  
  /* Disable multi-threading when incx == 0 or incy == 0: in that case the
     per-thread chunks would alias the same element and depend on each other. */
  if (incx == 0 || incy == 0)
	  nthreads = 1;
  
  if (nthreads == 1) {
#endif

    SWAP_K(n, 0, 0, ZERO, x, incx, y, incy, NULL, 0);

#ifdef SMP
  } else {
    
#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
    mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
    
    /* dummyalpha only fills the scalar slot of the generic threading helper. */
    blas_level1_thread(mode, n, 0, 0, dummyalpha,
		       x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads);
  }

#endif

  FUNCTION_PROFILE_END(1, 2 * n, 0);

  IDEBUG_END;

  return;
  
}
コード例 #7
0
ファイル: scal.c プロジェクト: 34985086/meshlab
/*
 * Real SCAL interface (x := alpha * x per the BLAS definition; the element
 * work is done by SCAL_K).
 *
 * Two entry points share one body:
 *   - NAME  : Fortran-style interface, every argument passed by pointer.
 *   - CNAME : CBLAS-style interface, scalars passed by value.
 *
 * n        : number of elements; nothing is done when n <= 0.
 * alpha    : scale factor; nothing is done when alpha == ONE.
 * x, incx  : vector and its stride; nothing is done when incx <= 0.
 *
 * The opening conditional must mirror the `#ifndef CBLAS` used for the debug
 * print below; without it the `#else` / `#endif` pair is unbalanced.
 */
#ifndef CBLAS

void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){
  
  blasint n    = *N;
  blasint incx = *INCX;
  FLOAT alpha = *ALPHA;

#else

void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){
 
#endif

#ifdef SMP
  int mode, nthreads;
#endif

#ifndef CBLAS
  PRINT_DEBUG_NAME;
#else
  PRINT_DEBUG_CNAME;
#endif

  if (incx <= 0 || n <= 0) return;

  if (alpha == ONE) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();


#ifdef SMP
  nthreads = num_cpu_avail(1);

  if (nthreads == 1) {
#endif

  SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0);

#ifdef SMP
  } else {

    /* The mode word must describe the real element type so that the
       threading helper handles the data with the right precision; cover
       XDOUBLE too, as the other real level-1 interfaces do. */
#ifdef XDOUBLE
    mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
    mode  =  BLAS_DOUBLE | BLAS_REAL;
#else
    mode  =  BLAS_SINGLE | BLAS_REAL;
#endif  
    
    /* The Fortran entry already has a pointer to alpha; the CBLAS entry
       received it by value and must take its address. */
    blas_level1_thread(mode, n, 0, 0,
#ifndef CBLAS
		       ALPHA, 
#else
		       &alpha,
#endif
		       x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads);

  }
#endif

  FUNCTION_PROFILE_END(1, n, n);

  IDEBUG_END;

  return;
  
}