コード例 #1
0
ファイル: lauum_U_parallel.c プロジェクト: AmEv7Fam/opentoonz
/*
 * Threaded, blocked driver that walks the diagonal of the matrix in args->a.
 * For every diagonal block it runs syrk_thread (HERK_UN), then gemm_thread_m
 * (TRMM_RCUN), then recurses on the diagonal block itself.
 * Presumably this is the upper-triangular LAUUM driver -- confirm against
 * the build that defines CNAME.
 *
 *   args     problem descriptor; n, a, lda and nthreads are read here.
 *   range_m  not referenced by this routine.
 *   range_n  optional pair; when non-NULL the order becomes
 *            range_n[1] - range_n[0].
 *   sa, sb   forwarded unchanged to every callee.
 *   myid     not referenced by this routine.
 *
 * Always returns 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  FLOAT      unit_alpha[2] = { ONE, ZERO};
  blas_arg_t sub;
  FLOAT     *base;
  BLASLONG   order, ld, step, start, width;
  int        mode;

  /* One thread only: hand the whole job to the serial routine.
   * NOTE(review): range_n is not forwarded here although it is forwarded on
   * the small-order path below -- confirm this is intentional. */
  if (args -> nthreads == 1) {
    LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0);
    return 0;
  }

  base  = (FLOAT *)args -> a;
  ld    = args -> lda;
  order = args -> n;
  if (range_n) order = range_n[1] - range_n[0];

  /* Small problems also go straight to the serial routine. */
  if (order <= GEMM_UNROLL_N * 2) {
    LAUUM_U_SINGLE(args, NULL, range_n, sa, sb, 0);
    return 0;
  }

  /* Mode word handed to the threading helpers: precision first ... */
#if defined(XDOUBLE)
  mode = BLAS_XDOUBLE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE;
#else
  mode = BLAS_SINGLE;
#endif
  /* ... then real versus complex. */
#ifdef COMPLEX
  mode |= BLAS_COMPLEX;
#else
  mode |= BLAS_REAL;
#endif

  /* Fields that stay the same for every sub-problem. */
  sub.nthreads = args -> nthreads;
  sub.alpha    = unit_alpha;
  sub.beta     = NULL;
  sub.lda      = ld;
  sub.ldb      = ld;
  sub.ldc      = ld;

  /* Block width: half the order rounded up to a multiple of GEMM_UNROLL_N
   * (the mask assumes GEMM_UNROLL_N is a power of two), capped at GEMM_Q. */
  step = (order / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (step > GEMM_Q) step = GEMM_Q;

  start = 0;
  while (start < order) {
    /* Top of column `start`, and the (start, start) diagonal element. */
    FLOAT *panel = base + (        start * ld) * COMPSIZE;
    FLOAT *diag  = base + (start + start * ld) * COMPSIZE;

    /* The last block may be narrower than `step`. */
    width = (order - start > step) ? step : (order - start);

    /* Step 1: HERK_UN through syrk_thread, with a = panel and c = base. */
    sub.n = start;
    sub.k = width;
    sub.a = panel;
    sub.c = base;
    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
                &sub, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads);

    /* Step 2: TRMM_RCUN through gemm_thread_m, with a = diag and b = panel. */
    sub.m = start;
    sub.n = width;
    sub.a = diag;
    sub.b = panel;
    gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE,
                  &sub, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads);

    /* Step 3: recurse on the width-by-width diagonal block. */
    sub.m = width;
    sub.n = width;
    sub.a = diag;
    CNAME(&sub, NULL, NULL, sa, sb, 0);

    start += step;
  }

  return 0;
}
コード例 #2
0
ファイル: trtri_U_parallel.c プロジェクト: 4ker/OpenBLAS
/*
 * Threaded, blocked driver built from the TRSM, GEMM_NN and TRMM kernels and
 * the TRTI2 routine, applied block by block along the diagonal of the matrix
 * in args->a.  Presumably this is the upper-triangular TRTRI (triangular
 * inverse) driver -- confirm against the macros that define CNAME, TRSM,
 * TRMM and TRTI2.
 *
 *   args     problem descriptor; n, a, lda and nthreads are read here.
 *   range_m  not referenced by this routine.
 *   range_n  optional pair; when non-NULL the order becomes
 *            range_n[1] - range_n[0].
 *   sa, sb   forwarded unchanged to every callee.
 *   mypos    not referenced by this routine.
 *
 * Returns the TRTI2 result when n <= DTB_ENTRIES, otherwise 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {

  BLASLONG n, info;
  BLASLONG bk, i, blocking;
  int mode;
  BLASLONG lda;
  blas_arg_t newarg;
  FLOAT *a;
  FLOAT alpha[2] = { ONE, ZERO};
  FLOAT beta [2] = {-ONE, ZERO};

  /* Mode word handed to the threading helpers: precision | real/complex. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif

  n  = args -> n;
  a  = (FLOAT *)args -> a;
  lda = args -> lda;

  if (range_n) n  = range_n[1] - range_n[0];

  /* Small problems go straight to TRTI2; its result is returned as is. */
  if (n <= DTB_ENTRIES) {
    info = TRTI2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  /* Block width: GEMM_Q, or a quarter of n (rounded up) when n < 4 * GEMM_Q. */
  blocking = GEMM_Q;
  if (n < 4 * GEMM_Q) blocking = (n + 3) / 4;

  for (i = 0; i < n; i += blocking) {
    /* The last block may be narrower than `blocking`. */
    bk = n - i;
    if (bk > blocking) bk = blocking;

    /* The previous revision also filled a local range_N[2] = {i, i + bk}
     * here; nothing ever read it, so the dead stores were removed. */

    newarg.lda = lda;
    newarg.ldb = lda;
    newarg.ldc = lda;
    newarg.alpha = alpha;

    /* Step 1: TRSM through gemm_thread_m.  a is the (i, i) diagonal element,
     * b is the top of column i; m = i, n = bk, beta = {-ONE, ZERO}. */
    newarg.m = i;
    newarg.n = bk;
    newarg.a = a + (i + i * lda) * COMPSIZE;
    newarg.b = a + (    i * lda) * COMPSIZE;

    newarg.beta  = beta;
    newarg.nthreads = args -> nthreads;

    gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads);

    /* Step 2: recurse on the bk-by-bk diagonal block.
     * NOTE(review): the return value is discarded, so a non-zero info from
     * the recursion is not propagated -- confirm callers do not need it. */
    newarg.m = bk;
    newarg.n = bk;

    newarg.a = a + (i + i * lda) * COMPSIZE;

    CNAME  (&newarg, NULL, NULL, sa, sb, 0);

    /* Step 3: GEMM_NN through gemm_thread_n.  a is the top of column i,
     * b is element (i, i + bk), c is the top of column i + bk;
     * m = i, n = n - i - bk, k = bk, beta = NULL. */
    newarg.m = i;
    newarg.n = n - i - bk;
    newarg.k = bk;

    newarg.a = a + (     i       * lda) * COMPSIZE;
    newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
    newarg.c = a + (    (i + bk) * lda) * COMPSIZE;

    newarg.beta  = NULL;

    gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads);

    /* Step 4: TRMM through gemm_thread_n.  a is the (i, i) diagonal element,
     * b is element (i, i + bk); m = bk, n = n - i - bk. */
    newarg.a = a + (i +  i       * lda) * COMPSIZE;
    newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;

    newarg.m = bk;
    newarg.n = n - i - bk;

    gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads);

  }

  return 0;
}
コード例 #3
0
ファイル: potrf_L_parallel.c プロジェクト: 4ker/OpenBLAS
/*
 * Threaded, blocked driver that factors the matrix in args->a one diagonal
 * block at a time.  For each block it recurses on the diagonal block, runs
 * TRSM_RCLN through gemm_thread_m on the rows below it, then runs a HERK
 * update (HERK_THREAD_LN or syrk_thread with HERK_LN) on the trailing part.
 * Presumably this is the lower-triangular POTRF (Cholesky) driver -- confirm
 * against the build that defines CNAME.
 *
 *   args     problem descriptor; n, a, lda and nthreads are read here.
 *   range_m  not referenced by this routine.
 *   range_n  optional pair; when non-NULL the order becomes
 *            range_n[1] - range_n[0].
 *   sa, sb   forwarded unchanged to every callee.
 *   myid     not referenced by this routine.
 *
 * Returns 0 when every step reports 0.  A non-zero result from the recursion
 * on a diagonal block is returned with that block's starting index added.
 * On the serial paths the POTRF_L_SINGLE result is returned unchanged.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  FLOAT      minus_one[2] = { -ONE, ZERO};
  blas_arg_t sub;
  FLOAT     *base;
  BLASLONG   order, ld, step, start, width, rest;
  BLASLONG   info;
  int        mode;

  /* Mode word handed to the threading helpers: precision first ... */
#if defined(XDOUBLE)
  mode = BLAS_XDOUBLE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE;
#else
  mode = BLAS_SINGLE;
#endif
  /* ... then real versus complex. */
#ifdef COMPLEX
  mode |= BLAS_COMPLEX;
#else
  mode |= BLAS_REAL;
#endif

  /* One thread only: hand the whole job to the serial routine.
   * NOTE(review): range_n is not forwarded here although it is forwarded on
   * the small-order path below -- confirm this is intentional. */
  if (args -> nthreads == 1) {
    info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
    return info;
  }

  order = args -> n;
  base  = (FLOAT *)args -> a;
  ld    = args -> lda;
  if (range_n) order = range_n[1] - range_n[0];

  /* Small problems also go straight to the serial routine. */
  if (order <= GEMM_UNROLL_N * 4) {
    info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  /* Fields that stay the same for every sub-problem. */
  sub.nthreads = args -> nthreads;
  sub.alpha    = minus_one;
  sub.beta     = NULL;
  sub.lda      = ld;
  sub.ldb      = ld;
  sub.ldc      = ld;

  /* Block width: half the order rounded up to a multiple of GEMM_UNROLL_N
   * (the mask assumes GEMM_UNROLL_N is a power of two), capped at GEMM_Q. */
  step = (order / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (step > GEMM_Q) step = GEMM_Q;

  for (start = 0; start < order; start += step) {
    FLOAT *diag;
    FLOAT *below;

    /* The last block may be narrower than `step`. */
    width = (order - start > step) ? step : (order - start);

    /* Step 1: recurse on the width-by-width diagonal block. */
    diag  = base + (start + start * ld) * COMPSIZE;
    sub.m = width;
    sub.n = width;
    sub.a = diag;

    info = CNAME(&sub, NULL, NULL, sa, sb, 0);
    if (info) return info + start;

    /* Nothing lies below or to the right of the final block. */
    rest = order - start - width;
    if (rest <= 0) continue;

    /* Step 2: TRSM_RCLN through gemm_thread_m on the `rest` rows below the
     * diagonal block, with a = diag and b = that panel. */
    below = base + (start + width + start * ld) * COMPSIZE;
    sub.m = rest;
    sub.n = width;
    sub.a = diag;
    sub.b = below;

    gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
                  &sub, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);

    /* Step 3: HERK update of the trailing rest-by-rest part, with
     * a = the panel below the diagonal block and
     * c = element (start + width, start + width). */
    sub.n = rest;
    sub.k = width;
    sub.a = below;
    sub.c = base + (start + width + (start + width) * ld) * COMPSIZE;

#ifndef USE_SIMPLE_THREADED_LEVEL3
    HERK_THREAD_LN(&sub, NULL, NULL, sa, sb, 0);
#else
    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
                &sub, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
#endif
  }

  return 0;
}