// Tensors must be sorted from most to least significant dimension. // nc is the number of contracted indices, and // ind is an array of (nx by 2) integers (pairs of contracted indices). Tensor *tensdot(Tensor *a, Tensor *b, int nc, int *ind, int nref, Slice *m) { int i = 0; int ord_a, ord_b; int *anum, *bnum; int an, bn; Tensor *t; #ifdef PARANOID if(nc > a->n || nc > b->n) { fprintf(stderr, "tensdor: more contracted than input dims.\n"); exit(1); } #endif anum = number_indices(a->n, nc, ind); bnum = number_indices(b->n, nc, ind+1); t = tensor_ctor(a->n + b->n - 2*nc); an = copy_shape(a, anum, t->shape); bn = copy_shape(b, bnum, t->shape + a->n - nc); t->len = an*bn; t->b = reserve_block(m, nref, t->len*sizeof(float)); if(nc == 0) { // trivial case A <- a X Y^T + A, A m x n GER(bn, an, 1.0, b->b->x, 1, a->b->x, 1, t->b->x, an); //GER(bn, an, a->scale*b->scale, b->b->x, 1, a->b->x, 1, // t->b->x, an); free(anum); free(bnum); return t; } if( (ord_a = ck_ordered(a->n, anum))) { if( (ord_b = ck_ordered(b->n, bnum)) && ck_same(nc, ind)) { // Straightforward dgemm. GEMM(ord_b == 1, ord_a == -1, bn, an, a->len / an, 1.0, a->b->x, an, b->b->x, bn, 0.0, t->b->x, bn); //GEMM(ord_b == 1, ord_a == -1, bn, an, a->len / an, // a->scale*b->scale, a->b->x, an, b->b->x, bn, // 0.0, t->b->x, bn); // TODO: decide if transposing A would give better perf. } else { // Need to transpose B. printf("Need to transpose B.\n"); } } else if( (ord_b = ck_ordered(b->n, bnum))) { // Need to transpose A printf("Need to transpose A.\n"); } else { // Need to transpose both A and B. printf("Need to transpose A & B.\n"); } free(anum); free(bnum); return t; }
int main(int argc, char *argv[]){ FLOAT *a, *x, *y; FLOAT alpha[] = {1.0, 1.0}; blasint m, i, j; blasint inc_x=1,inc_y=1; blasint n=0; int has_param_n = 0; int loops = 1; int l; char *p; int from = 1; int to = 200; int step = 1; struct timeval start, stop; double time1,timeg; argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); if ((p = getenv("OPENBLAS_PARAM_N"))) { n = atoi(p); if ((n>0) && (n<=to)) has_param_n = 1; } if ( has_param_n == 1 ) fprintf(stderr, "From : %3d To : %3d Step = %3d N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,n,inc_x,inc_y,loops); else fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ fprintf(stderr,"Out of Memory!!\n");exit(1); } #ifdef linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); for(m = from; m <= to; m += step) { timeg=0; if ( has_param_n == 0 ) n = m; fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } for (l=0; l<loops; l++) { gettimeofday( &start, (struct timezone *)0); GER (&m, &n, alpha, x, &inc_x, y, &inc_y, a , &m); gettimeofday( &stop, (struct timezone *)0); time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += time1; } timeg /= loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); } return 0; }
void NAME(blasint *M, blasint *N, FLOAT *Alpha, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ blasint m = *M; blasint n = *N; FLOAT alpha = *Alpha; blasint incx = *INCX; blasint incy = *INCY; blasint lda = *LDA; FLOAT *buffer; #ifdef SMP int nthreads; #endif blasint info; PRINT_DEBUG_NAME; info = 0; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; if (info){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT *buffer; blasint info, t; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; info = 0; if (order == CblasColMajor) { info = -1; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (order == CblasRowMajor) { info = -1; t = n; n = m; m = t; t = incx; incx = incy; incy = t; buffer = x; x = y; y = buffer; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return if possible. */ if (m == 0 || n == 0) return; if (alpha == 0.) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incy < 0) y -= (n - 1) * incy; if (incx < 0) x -= (m - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); #ifdef SMP } else { GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; }