Exemple #1
0
/*
 * Blocked, threaded panel-factorization driver (presumably the parallel LU /
 * GETRF path, judging by the GETF2, GETRF_SINGLE and LASWP_PLUS helpers it
 * calls -- confirm against the build definitions of those macros).
 *
 * Parameters:
 *   args     problem description: m, n, a, lda are read directly, args->c is
 *            used as the pivot array, args->nthreads drives work splitting.
 *   range_m  not used by this routine.
 *   range_n  optional; when non-NULL the routine works on the sub-problem
 *            whose first row/column is range_n[0] and whose columns are
 *            [range_n[0], range_n[1]).
 *   sa, sb   scratch buffers.  sb receives the TRSM_ILTCOPY-packed diagonal
 *            block; the aligned area behind it (sbb) is passed to helpers.
 *   myid     not used by this routine.
 *
 * Returns:
 *   0 when m or n is non-positive; the GETF2 result for problems too small
 *   to block; otherwise the first non-zero status reported by a panel
 *   factorization (shifted by the panel start `is` for every panel after the
 *   first), or 0 when all panels reported 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;

  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];

  /* Cumulative row / column split points handed to the worker queue. */
  BLASLONG range_M[MAX_CPU_NUMBER + 1];
  BLASLONG range_N[MAX_CPU_NUMBER + 1];

  job_t        job[MAX_CPU_NUMBER];

  BLASLONG width, nn, mm;
  BLASLONG i, j, k, is, bk;

  BLASLONG num_cpu;

  /* One completion flag per queued worker, spaced CACHE_LINE_SIZE entries
     apart.  Set to 1 here before dispatch and polled until it reads 0
     (presumably cleared by the worker routine -- not visible here). */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

  /* Encode precision and real/complex kind for the thread dispatcher. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  if (range_n) {
    /* Restrict to the sub-problem starting at (range_n[0], range_n[0]):
       advance `a` by range_n[0] columns (stride lda) plus range_n[0] rows. */
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;

  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common   = (void *)job;

  info = 0;

  mn = MIN(m, n);

  /* Initial panel width: half of mn rounded up to a multiple of
     GEMM_UNROLL_N (the mask assumes GEMM_UNROLL_N is a power of two),
     capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Too small to block: hand the whole (sub-)problem to GETF2. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  next_bk = init_bk;

  bk = mn;
  if (bk > next_bk) bk = next_bk;

  /* Process the first panel by calling this routine recursively. */
  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;

  iinfo   = CNAME(args, NULL, range_n_new, sa, sb, 0);

  if (iinfo && !info) info = iinfo;

  /* Pack the bk x bk diagonal block into sb for the update helpers. */
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  /* Second scratch area: first aligned address past the packed block.
     The rounding is done in BLASLONG (not `long`) so the address is not
     narrowed on platforms where `long` is smaller than a pointer; the mask
     operand is widened as well so that its complement keeps the high bits.
     Assumes BLASLONG is pointer-sized -- confirm against the project's type
     definitions. */
  sbb = (FLOAT *)((((BLASLONG)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~(BLASLONG)(GEMM_ALIGN)) + GEMM_OFFSET_B);

  is = 0;
  num_cpu = 0;

  while (is < mn) {

    /* Number of trailing columns the calling thread will update itself,
       rounded up to a multiple of GEMM_UNROLL_N and limited to what remains
       inside the mn x mn leading part. */
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      /* The caller's share is narrower than the current panel: pick a
         (never larger) width for the next panel and use it as the share. */
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;

      width = next_bk;
      if (width > mn - is - bk) width = mn - is - bk;
    }

    /* num_cpu still holds the previous iteration's queue length here. */
    if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);

    mm = m - bk - is;
    nn = n - bk - is;

    /* Describe the trailing update for this panel. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = mm;
    newarg.n   = nn;
    newarg.k   = bk;
    newarg.ldb = is + offset;

    /* The first `width` columns belong to the calling thread. */
    nn -= width;

    range_n_mine[0] = 0;
    range_n_mine[1] = width;

    range_N[0] = width;
    range_M[0] = 0;

    num_cpu  = 0;

    /* Split the remaining nn columns and mm rows into one slice pair per
       queued worker.  Whichever dimension is at least as large as the other
       is cut second; once the first-cut dimension is exhausted the second
       takes everything that is left. */
    while (nn > 0){

      if (mm >= nn) {

        width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
        if (nn < width) width = nn;
        nn -= width;
        range_N[num_cpu + 1] = range_N[num_cpu] + width;

        width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
        if (mm < width) width = mm;
        if (nn <=    0) width = mm;
        mm -= width;
        range_M[num_cpu + 1] = range_M[num_cpu] + width;

      } else {

        width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
        if (mm < width) width = mm;
        mm -= width;
        range_M[num_cpu + 1] = range_M[num_cpu] + width;

        width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
        if (nn < width) width = nn;
        if (mm <=    0) width = nn;
        nn -= width;
        range_N[num_cpu + 1] = range_N[num_cpu] + width;

      }

      /* Each worker gets its own row slice start but the whole column
         split table (range_N from index 0). */
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = &range_M[num_cpu];
      queue[num_cpu].range_n = &range_N[0];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      flag[num_cpu * CACHE_LINE_SIZE] = 1;

      num_cpu ++;

    }

    newarg.nthreads = num_cpu;

    /* Clear the per-pair `working` slots shared through newarg.common. */
    if (num_cpu > 0) {
      for (j = 0; j < num_cpu; j++) {
        for (i = 0; i < num_cpu; i++) {
          for (k = 0; k < DIVIDE_RATE; k++) {
            job[j].working[i][CACHE_LINE_SIZE * k] = 0;
          }
        }
      }
    }

    /* Advance to the next panel before dispatch: range_n_new describes the
       panel that is factored while the workers run the trailing update. */
    is += bk;

    bk = mn - is;
    if (bk > next_bk) bk = next_bk;

    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;

    if (num_cpu > 0) {

      /* Terminate the queue's linked list and launch it asynchronously. */
      queue[num_cpu - 1].next = NULL;

      exec_blas_async(0, &queue[0]);

      /* Calling thread: update its own columns, then process the next
         panel with GETRF_SINGLE. */
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);

      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

      if (iinfo && !info) info = iinfo + is;

      /* Busy-wait until every worker's flag reads 0; only then is sb
         overwritten with the next packed diagonal block. */
      for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};

      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);

    } else {

      /* No workers were queued: do the update and the next panel serially.
         Note that sb is not repacked on this path. */
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);

      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

      if (iinfo && !info) info = iinfo + is;

    }

  }

  /* Second pass: walk the panels again, recomputing panel widths with the
     same formulas, and run LASWP_PLUS over each panel's bk columns through
     blas_level1_thread using the pivot array. */
  next_bk = init_bk;
  is = 0;

  while (is < mn) {

    bk = mn - is;
    if (bk > next_bk) bk = next_bk;

    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      if (next_bk > bk) next_bk = bk;
    }

    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
                       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
                       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);

    is += bk;
  }

  return info;
}
Exemple #2
0
/*
 * Blocked, threaded panel-factorization driver in which every worker runs
 * inner_basic_thread on its own column slice (presumably a parallel LU /
 * GETRF path, judging by the GETF2, GETRF_SINGLE and LASWP_PLUS helpers --
 * confirm against the build definitions of those macros).
 *
 * Parameters:
 *   args     problem description: m, n, a, lda are read directly, args->c is
 *            used as the pivot array, args->nthreads drives work splitting.
 *   range_m  not used by this routine.
 *   range_n  optional; when non-NULL the routine works on the sub-problem
 *            whose first row/column is range_n[0] and whose columns are
 *            [range_n[0], range_n[1]).
 *   sa, sb   scratch buffers.  sb receives the TRSM_ILTCOPY-packed diagonal
 *            block; the aligned area behind it (sbb) is passed to helpers.
 *   myid     not used by this routine.
 *
 * Returns:
 *   0 when m or n is non-positive; the GETF2 result for problems too small
 *   to block; otherwise the status of the first (recursive) panel if it is
 *   non-zero, else the first non-zero status of a later panel shifted by
 *   that panel's start `is`, else 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;
  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  /* Slot 0 of queue is never filled: the calling thread takes the first
     column slice itself and workers occupy slots 1 .. num_cpu-1. */
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range[MAX_CPU_NUMBER + 1];

  BLASLONG width, nn, num_cpu;

  /* One completion flag per queued worker, spaced CACHE_LINE_SIZE entries
     apart.  Set to 1 here before dispatch and polled until it reads 0
     (presumably cleared by the worker routine -- not visible here). */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

  /* Encode precision and real/complex kind for the thread dispatcher. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  if (range_n) {
    /* Restrict to the sub-problem starting at (range_n[0], range_n[0]):
       advance `a` by range_n[0] columns (stride lda) plus range_n[0] rows. */
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;

  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common = NULL;
  newarg.nthreads = args -> nthreads;

  mn = MIN(m, n);

  /* Initial panel width: half of mn rounded up to a multiple of
     GEMM_UNROLL_N (the mask assumes GEMM_UNROLL_N is a power of two),
     capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Too small to block: hand the whole (sub-)problem to GETF2. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  /* If the caller's column share for the first panel would be narrower than
     the panel itself, shrink the initial panel to the FORMULA2 width (but
     never below GEMM_UNROLL_N). */
  width = FORMULA1(m, n, 0, init_bk, args -> nthreads);
  width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (width > n - init_bk) width = n - init_bk;

  if (width < init_bk) {
    long temp;

    temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
    temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N;
    if (temp < init_bk) init_bk = temp;

  }

  next_bk = init_bk;
  bk      = init_bk;

  /* Process the first panel by calling this routine recursively. */
  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;

  info   = CNAME(args, NULL, range_n_new, sa, sb, 0);

  /* Pack the bk x bk diagonal block into sb for the update helpers. */
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  is = 0;
  num_cpu = 0;

  /* Second scratch area: first aligned address past GEMM_PQ * GEMM_PQ
     elements of sb.  The rounding is done in BLASLONG (not `long`) so the
     address is not narrowed on platforms where `long` is smaller than a
     pointer; the mask operand is widened as well so that its complement
     keeps the high bits.  Assumes BLASLONG is pointer-sized -- confirm
     against the project's type definitions. */
  sbb = (FLOAT *)((((BLASLONG)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~(BLASLONG)(GEMM_ALIGN)) + GEMM_OFFSET_B);

  while (is < mn) {

    /* Columns the calling thread will update itself, rounded up to a
       multiple of GEMM_UNROLL_N. */
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (width < bk) {

      /* The caller's share is narrower than the current panel: choose the
         next panel width from FORMULA2 and use it as the share. */
      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif

      width = next_bk;
    }

    /* Never reach past the mn x mn leading part ... */
    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    /* ... nor past the columns that actually remain. */
    nn = n - bk - is;
    if (width > nn) width = nn;

    /* num_cpu still holds the previous iteration's count here; workers
       live in queue[1 .. num_cpu-1]. */
    if (num_cpu > 1)  exec_blas_async_wait(num_cpu - 1, &queue[1]);

    /* Slice 0 (columns [0, width)) is the calling thread's own. */
    range[0] = 0;
    range[1] = width;

    num_cpu = 1;
    nn -= width;

    /* Describe the trailing update for this panel. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = m - bk - is;
    newarg.n   = n - bk - is;
    newarg.k   = bk;
    newarg.ldb = is + offset;

    /* Spread the remaining nn columns over the other threads; range[] holds
       cumulative split points and each worker reads its own pair. */
    while (nn > 0){

      width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu);

      nn -= width;
      /* Overshoot on the last slice: trim it back to what was left. */
      if (nn < 0) width = width + nn;

      range[num_cpu + 1] = range[num_cpu] + width;

      queue[num_cpu].mode    = mode;
      //queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].routine = (void *)inner_basic_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = NULL;
      queue[num_cpu].range_n = &range[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      flag[num_cpu * CACHE_LINE_SIZE] = 1;

      num_cpu ++;
    }

    /* Terminate the linked list (writes slot 0's `next` when no worker was
       queued; that slot is never dispatched). */
    queue[num_cpu - 1].next = NULL;

    /* Advance to the next panel before dispatch: range_n_new describes the
       panel that is factored while the workers run the trailing update. */
    is += bk;

    bk = n - is;
    if (bk > next_bk) bk = next_bk;

    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;

    if (num_cpu > 1) {

      exec_blas_async(1, &queue[1]);

#if 0
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0);

      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
#else

      if (range[1] >= bk * 4) {

        /* The caller's slice is at least four panels wide: update only the
           first bk columns, process the next panel with GETRF_SINGLE, and
           update the rest of the slice afterwards. */
        BLASLONG myrange[2];

        myrange[0] = 0;
        myrange[1] = bk;

        inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

        iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

        myrange[0] = bk;
        myrange[1] = range[1];

        inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

      } else {

        /* Update the whole own slice first, then process the next panel. */
        inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);

        iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      }

#endif

      /* Busy-wait until every worker's flag reads 0; only then is sb
         overwritten with the next packed diagonal block. */
      for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};

      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);

    } else {

      /* No workers were queued: do the update and the next panel serially.
         Note that sb is not repacked on this path. */
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);

      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
    }

    /* Keep only the first failure, shifted by the panel's start. */
    if (iinfo && !info) info = iinfo + is;

  }

  /* Second pass: walk the panels again, recomputing panel widths with the
     same formulas, and run LASWP_PLUS over each panel's bk columns through
     blas_level1_thread using the pivot array. */
  next_bk = init_bk;
  bk      = init_bk;

  is = 0;

  while (is < mn) {

    bk = mn - is;
    if (bk > next_bk) bk = next_bk;

    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (width < bk) {
      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif
    }

    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha,
                       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
                       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);

    is += bk;
  }

  return info;
}
Exemple #3
0
/*
 * Fortran-callable driver: validates its arguments, factors the N-by-N
 * matrix `a` with GETRF_SINGLE / GETRF_PARALLEL and, when that reports 0,
 * runs the matching GETRS_N_* routine on the NRHS columns of `b`.
 *
 * All scalar arguments are passed by pointer.
 *
 *   N, NRHS   matrix order and number of right-hand-side columns.
 *   a, ldA    matrix and its leading dimension (must be >= max(1, N)).
 *   ipiv      pivot array, forwarded to the helpers as args.c.
 *   b, ldB    right-hand sides and their leading dimension (>= max(1, N)).
 *   Info      out: 0 on the quick-return path, -(argument index) after an
 *             argument error (xerbla is called with the positive index),
 *             otherwise the status returned by the factorization.
 *
 * The function's own return value is always 0.
 */
int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
         FLOAT *b, blasint *ldB, blasint *Info){

  blas_arg_t args;

  blasint status;
  FLOAT *workspace;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  /* Pointers first, then dimensions. */
  args.a    = (void *)a;
  args.b    = (void *)b;
  args.c    = (void *)ipiv;
  args.m    = *N;
  args.n    = *NRHS;
  args.lda  = *ldA;
  args.ldb  = *ldB;

  /* Report the lowest-numbered offending argument. */
  status = 0;
  if (args.m < 0) {
    status = 1;
  } else if (args.n < 0) {
    status = 2;
  } else if (args.lda < MAX(1,args.m)) {
    status = 4;
  } else if (args.ldb < MAX(1,args.m)) {
    status = 7;
  }

  if (status != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &status, sizeof(ERROR_NAME));
    *Info = - status;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Nothing to do for an empty system or no right-hand sides.
     NOTE(review): this also skips the factorization when only NRHS is 0 --
     confirm that callers do not rely on `a` / `ipiv` being updated then. */
  if (args.n == 0 || args.m == 0) return 0;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  /* Carve sa and sb out of one allocation: sa at GEMM_OFFSET_A, sb after a
     GEMM_ALIGN-rounded GEMM_P * GEMM_Q * COMPSIZE * SIZE bytes, plus
     GEMM_OFFSET_B. */
  workspace = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)workspace + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads != 1) {

    /* Threaded path: factor with n = N, then solve with n = NRHS. */
    args.n    = *N;
    status = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0);

    if (!status) {
      args.n    = *NRHS;
      GETRS_N_PARALLEL(&args, NULL, NULL, sa, sb, 0);
    }

  } else
#endif
  {

    /* Serial path (the only one in non-SMP builds). */
    args.n    = *N;
    status = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0);

    if (!status) {
      args.n    = *NRHS;
      GETRS_N_SINGLE(&args, NULL, NULL, sa, sb, 0);
    }
  }

#ifndef PPC440
  blas_memory_free(workspace);
#endif

  *Info = status;

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, *N * *N, 2. / 3. * *N * *N * *N + *N * *N);

  IDEBUG_END;

  return 0;
}