// Regression test for https://github.com/xianyi/OpenBLAS/issues/695
//
// Call sequence exercised here:
//   1. cpotrf ('U', n = 10) on the single-precision complex matrix A1,
//   2. zpotrs ('L', n = 10, nrhs = 2) on the double-precision complex A2 / B,
//   3. cpotrf ('U', n = 10) again on A3, whose initial data is identical to A1.
// The test fails if element 91 of the factored A3 has a NaN real or
// imaginary part.  The `info` outputs of the three calls are not inspected.
CTEST(potrf, bug_695){

  // Input of the first cpotrf call (overwritten in place by the call).
  openblas_complex_float A1[100] = {
    5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I,
    -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I,
    3.0476584-0.18548489*I, 0.3842994-0.7050991*I, -0.79540455+0.7066077*I, 8.313246+0.0*I,
    -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I,
    -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
    0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I,
    -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I,
    2.4704504+1.0389464*I, 1.0858271+1.298006*I, 2.619998+1.8532984*I, 0.47806996+0.48494184*I,
    -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I,
    0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
    -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I,
    3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I,
    -2.7767487-1.7693765*I, 0.048102796+0.9741874*I, 0.32275113+0.015575029*I, -0.7285097-0.10360408*I,
    0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I,
    -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
    2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I,
    -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I,
    1.4415971-0.29750675*I, 1.221786+0.85654986*I, 0.27894387+0.97911835*I, -0.4271084+0.042899966*I,
    -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I,
    -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
    3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I,
    -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I,
    7.005772+0.0*I, -0.9617417+1.2486815*I, 0.3842994+0.7050991*I, 1.8942566+0.6389735*I,
    1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I,
    1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};
  char up = 'U';

  blasint n=10;
  blasint info[1];
  BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info);
  //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91]));

  // Matrix and two right-hand sides for the intermediate zpotrs call.
  openblas_complex_double A2[100] = {
    3.0607147216796875+0.0*I, -0.5905849933624268-0.29020825028419495*I,
    0.321084201335907+0.45168760418891907*I, 0.8387917876243591-0.644718587398529*I,
    -0.3642411530017853+0.051274992525577545*I, 0.8071482181549072+0.33944568037986755*I,
    0.013674172572791576+0.21422699093818665*I, 0.35476258397102356+0.42408594489097595*I,
    -0.5991537570953369-0.23082709312438965*I, -0.0600702166557312-0.2113417387008667*I,
    -0.7954045534133911+0.7066076993942261*I, 2.807175397872925+0.0*I,
    -0.1691000759601593+0.313548743724823*I, -0.30911174416542053+0.7447023987770081*I,
    -0.22347848117351532+0.03316075727343559*I, -0.4088296890258789-1.0214389562606812*I,
    -0.2344931811094284+0.08056317269802094*I, 0.793269693851471-0.17507623136043549*I,
    0.03163455054163933+0.20559945702552795*I, 0.13581633567810059-0.2110036462545395*I,
    0.9827471375465393+1.3824869394302368*I, -1.8076121807098389-0.8882446885108948*I,
    2.3277781009674072+0.0*I, 0.830405056476593-0.19296252727508545*I,
    0.1394239068031311-0.5260677933692932*I, 1.239942193031311-0.09915469586849213*I,
    0.06731037050485611-0.059320636093616486*I, 0.11507681757211685-0.1984301060438156*I,
    -0.6843825578689575+0.4647614359855652*I, 1.213119387626648-0.7757048010826111*I,
    2.619997978210449+1.8532984256744385*I, 0.4780699610710144+0.48494184017181396*I,
    -0.18385779857635498+0.6468567848205566*I, 2.0811400413513184+0.0*I,
    -0.035075582563877106+0.09732913225889206*I, 0.27337002754211426-0.9032229781150818*I,
    -0.8374675512313843+0.0479498989880085*I, 0.6916252374649048+0.45711082220077515*I,
    0.1883818507194519+0.06482727080583572*I, -0.32384994626045227+0.05857187137007713*I,
    -1.8306152820587158-1.2336910963058472*I, 0.5096428990364075-0.5395973920822144*I,
    -1.833838701248169+0.7064958810806274*I, -1.956626057624817+0.22825956344604492*I,
    1.706615924835205+0.0*I, -0.2895336151123047+0.17579378187656403*I,
    -0.923172116279602-0.4530014097690582*I, 0.5040621757507324-0.37026339769363403*I,
    -0.2824432849884033-1.0374568700790405*I, 0.1399831622838974+0.4977008104324341*I,
    0.32275113463401794+0.015575028955936432*I, -0.7285097241401672-0.10360407829284668*I,
    0.041852742433547974-0.655687689781189*I, 0.07081800699234009-0.318013072013855*I,
    -0.25947219133377075+0.4878614842891693*I, 1.5735365152359009+0.0*I,
    -0.2647853195667267-0.26654252409935*I, -0.6190430521965027-0.24699924886226654*I,
    -0.6288471221923828+0.48154571652412415*I, 0.02446540631353855-0.2611822783946991*I,
    2.1968812942504883+1.0640623569488525*I, -1.1760060787200928-2.714695692062378*I,
    2.5673024654388428+1.9732997417449951*I, 0.3698374927043915-0.54008549451828*I,
    -0.4763622283935547-0.27821826934814453*I, -1.6697118282318115+0.4017511010169983*I,
    1.2674795389175415+0.0*I, 0.3079095482826233-0.07258892804384232*I,
    -0.5929520130157471-0.038360968232154846*I, 0.04388086497783661-0.025549031794071198*I,
    0.27894386649131775+0.9791183471679688*I, -0.42710840702056885+0.0428999662399292*I,
    -1.1148382425308228-0.1569381207227707*I, 0.8068630695343018+1.5315914154052734*I,
    -0.6160865426063538-2.0185799598693848*I, -1.439787745475769-0.7550917863845825*I,
    -0.10051321983337402+0.24303960800170898*I, 0.9066106081008911+0.0*I,
    0.05315789580345154-0.06136537343263626*I, -0.21304509043693542+0.6494344472885132*I,
    3.0476584434509277+0.1854848861694336*I, -1.7228562831878662+2.8335886001586914*I,
    2.4704504013061523-1.0389463901519775*I, 1.564915418624878-1.6229296922683716*I,
    -2.7767486572265625+1.769376516342163*I, -0.314566969871521-1.0403450727462769*I,
    1.4415971040725708+0.29750674962997437*I, -0.5856801271438599-1.0203559398651123*I,
    0.5668219923973083+0.0*I, 0.033351436257362366-0.07832501083612442*I,
    0.3842993974685669+0.7050991058349609*I, 1.894256591796875+0.6389734745025635*I,
    1.085827112197876-1.2980060577392578*I, -0.11207738518714905+1.2014245986938477*I,
    0.04810279607772827-0.9741873741149902*I, -0.31978556513786316+0.13701045513153076*I,
    1.2217860221862793-0.856549859046936*I, 0.7103452086448669+0.84221351146698*I,
    -0.9617416858673096-1.2486815452575684*I, 0.0756804421544075+0.0*I};
  openblas_complex_double B[20] = {
    -0.21782716937787788-0.9222220085490986*I, -0.7620356655676837+0.15533508334193666*I,
    -0.905011814118756+0.2847570854574069*I, -0.3451346708401685+1.076948486041297*I,
    0.25336108035924787+0.975317836492159*I, 0.11192755545114-0.1603741874112385*I,
    -0.20604111555491242+0.10570814584017311*I, -1.0568488936791578-0.06025820467086475*I,
    -0.6650468984506477-0.5000967284800251*I, -1.0509472322215125+0.5022165705328413*I,
    -0.727775859267237+0.50638268521728*I, 0.39947219167701153-0.4576746001199889*I,
    -0.7122162951294634-0.630289556702497*I, 0.9870834574024372-0.2825689605519449*I,
    0.0628393808469436-0.1253397353973715*I, 0.8439562576196216+1.0850814110398734*I,
    0.562377322638969-0.2578030745663871*I, 0.12696236014017806-0.09853584666755086*I,
    -0.023682508769195098+0.18093440285319276*I, -0.7264975746431271+0.31670415674097235*I};
  char lo = 'L';
  blasint nrhs = 2;
  BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info);

  // note that this is exactly equal to A1
  openblas_complex_float A3[100] = {
    5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I,
    -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I,
    3.0476584-0.18548489*I, 0.3842994-0.7050991*I, -0.79540455+0.7066077*I, 8.313246+0.0*I,
    -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I,
    -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
    0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I,
    -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I,
    2.4704504+1.0389464*I, 1.0858271+1.298006*I, 2.619998+1.8532984*I, 0.47806996+0.48494184*I,
    -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I,
    0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
    -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I,
    3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I,
    -2.7767487-1.7693765*I, 0.048102796+0.9741874*I, 0.32275113+0.015575029*I, -0.7285097-0.10360408*I,
    0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I,
    -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
    2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I,
    -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I,
    1.4415971-0.29750675*I, 1.221786+0.85654986*I, 0.27894387+0.97911835*I, -0.4271084+0.042899966*I,
    -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I,
    -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
    3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I,
    -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I,
    7.005772+0.0*I, -0.9617417+1.2486815*I, 0.3842994+0.7050991*I, 1.8942566+0.6389735*I,
    1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I,
    1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};

  BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info);
  // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91]));

  // Only element 91 of the second single-precision result is checked.
  if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) {
    CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__);
  }
}
void blas_shutdown(void){ int pos; #ifdef SMP BLASFUNC(blas_thread_shutdown)(); #endif LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { release_info[pos].func(&release_info[pos]); } #ifdef SEEK_ADDRESS base_address = 0UL; #else base_address = BASE_ADDRESS; #endif for (pos = 0; pos < NUM_BUFFERS; pos ++){ memory[pos].addr = (void *)0; memory[pos].used = 0; #if defined(WHEREAMI) && !defined(USE_OPENMP) memory[pos].pos = -1; #endif memory[pos].lock = 0; } UNLOCK_COMMAND(&alloc_lock); return; }
/* samax on {-1.1, 2.2, -3.3} with unit stride must be 3.3 within SINGLE_EPS. */
CTEST(amax, samax){
  float x[] = {-1.1, 2.2, -3.3};
  blasint inc = 1;
  blasint N = 3;

  /* Expected value, then the value computed by the library. */
  float tr_max = 3.3;
  float te_max = BLASFUNC(samax)(&N, x, &inc);

  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
}
/* Compares samax from the library under test against the reference
   implementation on the vector {-1.1, 2.2, -3.3}. */
void test_samax()
{
	float x[] = {-1.1, 2.2, -3.3};
	int inc = 1;
	int N = 3;

	/* Library under test first, reference second. */
	float te_max = BLASFUNC(samax)(&N, x, &inc);
	float tr_max = BLASFUNC_REF(samax)(&N, x, &inc);

	CU_ASSERT_DOUBLE_EQUAL(te_max, tr_max, CHECK_EPS);
}
/*
 * Register the thread-shutdown routine as the "prepare" handler of
 * pthread_atfork, so the pthread pool managed by OpenBLAS is shut down before
 * a fork.  This covers builds made with "make USE_OPENMP=0".
 *
 * A hang is still possible when OpenBLAS is built against libgomp's OpenMP
 * implementation; see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 .
 * Until that is resolved, build with USE_OPENMP=0 or link against a different
 * OpenMP implementation.
 *
 * Nothing is registered on Windows or Android, or when SMP_SERVER is not
 * defined.
 */
void openblas_fork_handler()
{
#if !defined(OS_WINDOWS) && !defined(OS_ANDROID) && defined(SMP_SERVER)
  int err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);

  if (err != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
void test_zdotu_offset_1(void) { int N=1,incX=1,incY=1; double x1[]={1.0,2.0,3.0,4.0}; double y1[]={5.0,6.0,7.0,8.0}; double x2[]={1.0,2.0,3.0,4.0}; double y2[]={5.0,6.0,7.0,8.0}; double _Complex result1=0.0; double _Complex result2=0.0; //OpenBLAS result1=BLASFUNC(zdotu)(&N,x1+1,&incX,y1+1,&incY); //reference result2=BLASFUNC_REF(zdotu)(&N,x2+1,&incX,y2+1,&incY); CU_ASSERT_DOUBLE_EQUAL(creal(result1), creal(result2), CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(cimag(result1), cimag(result2), CHECK_EPS); // printf("\%lf,%lf\n",creal(result1),cimag(result1)); }
void test_csrot_inc_0(void) { int i=0; int N=4,incX=0,incY=0; float c=0.25,s=0.5; float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; float x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; float y2[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; //OpenBLAS BLASFUNC(csrot)(&N,x1,&incX,y1,&incY,&c,&s); //reference BLASFUNC_REF(csrot)(&N,x2,&incX,y2,&incY,&c,&s); for(i=0; i<2*N; i++){ CU_ASSERT_DOUBLE_EQUAL(x1[i], x2[i], CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(y1[i], y2[i], CHECK_EPS); } }
void test_drot_inc_0(void) { int i=0; int N=4,incX=0,incY=0; double c=0.25,s=0.5; double x1[]={1.0,3.0,5.0,7.0}; double y1[]={2.0,4.0,6.0,8.0}; double x2[]={1.0,3.0,5.0,7.0}; double y2[]={2.0,4.0,6.0,8.0}; //OpenBLAS BLASFUNC(drot)(&N,x1,&incX,y1,&incY,&c,&s); //reference BLASFUNC_REF(drot)(&N,x2,&incX,y2,&incY,&c,&s); for(i=0; i<N; i++){ CU_ASSERT_DOUBLE_EQUAL(x1[i], x2[i], CHECK_EPS); CU_ASSERT_DOUBLE_EQUAL(y1[i], y2[i], CHECK_EPS); } }
void NAME(blasint *M, blasint *N, FLOAT *Alpha, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ blasint m = *M; blasint n = *N; FLOAT alpha = *Alpha; blasint incx = *INCX; blasint incy = *INCY; blasint lda = *LDA; FLOAT *buffer; #ifdef SMP int nthreads; #endif blasint info; PRINT_DEBUG_NAME; info = 0; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; if (info){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT *buffer; blasint info, t; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; info = 0; if (order == CblasColMajor) { info = -1; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (order == CblasRowMajor) { info = -1; t = n; n = m; m = t; t = incx; incx = incy; incy = t; buffer = x; x = y; y = buffer; if (lda < MAX(1,m)) info = 9; if (incy == 0) info = 7; if (incx == 0) info = 5; if (n < 0) info = 2; if (m < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return if possible. */ if (m == 0 || n == 0) return; if (alpha == 0.) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incy < 0) y -= (n - 1) * incy; if (incx < 0) x -= (m - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); #ifdef SMP } else { GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; }
void NAME( char* ORDER, char* TRANS, blasint *rows, blasint *cols, FLOAT *alpha, FLOAT *a, blasint *lda, blasint *ldb) { char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; Order = *ORDER; Trans = *TRANS; TOUPPER(Order); TOUPPER(Trans); if ( Order == 'C' ) order = BlasColMajor; if ( Order == 'R' ) order = BlasRowMajor; if ( Trans == 'N' ) trans = BlasNoTrans; if ( Trans == 'R' ) trans = BlasNoTrans; if ( Trans == 'T' ) trans = BlasTrans; if ( Trans == 'C' ) trans = BlasTrans; #else void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, blasint ccols, FLOAT calpha, FLOAT *a, blasint clda, blasint cldb) { char Order, Trans; int order=-1,trans=-1; blasint info = -1; FLOAT *b; size_t msize; blasint *lda, *ldb, *rows, *cols; FLOAT *alpha; if ( CORDER == CblasColMajor) order = BlasColMajor; if ( CORDER == CblasRowMajor) order = BlasRowMajor; if ( CTRANS == CblasNoTrans || CTRANS == CblasConjNoTrans) trans = BlasNoTrans; if ( CTRANS == CblasTrans || CTRANS == CblasConjTrans ) trans = BlasTrans; rows = &crows; cols = &ccols; alpha = &calpha; lda = &clda; ldb = &cldb; #endif if ( order == BlasColMajor) { if ( trans == BlasNoTrans && *ldb < *rows ) info = 9; if ( trans == BlasTrans && *ldb < *cols ) info = 9; } if ( order == BlasRowMajor) { if ( trans == BlasNoTrans && *ldb < *cols ) info = 9; if ( trans == BlasTrans && *ldb < *rows ) info = 9; } if ( order == BlasColMajor && *lda < *rows ) info = 7; if ( order == BlasRowMajor && *lda < *cols ) info = 7; if ( *cols <= 0 ) info = 4; if ( *rows <= 0 ) info = 3; if ( trans < 0 ) info = 2; if ( order < 0 ) info = 1; if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if ( *lda > *ldb ) msize = (*lda) * (*ldb) * sizeof(FLOAT); else msize = (*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) { printf("Memory alloc failed\n"); exit(1); } if ( order == BlasColMajor ) { if ( trans == BlasNoTrans ) { OMATCOPY_K_CN(*rows, *cols, 
*alpha, a, *lda, b, *ldb ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb ); } else { OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); } } else { if ( trans == BlasNoTrans ) { OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); } else { OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb ); OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb ); } } free(b); return; }
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta = *BETA; blasint incy = *INCY; int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info 
= 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (symv_thread[uplo])(n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; }
void NAME(char *TRANSA, char *TRANSB, blasint *M, blasint *N, blasint *K, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ blas_arg_t args; int transa, transb, nrowa, nrowb; blasint info; char transA, transB; FLOAT *buffer; FLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; #endif PRINT_DEBUG_NAME; args.m = *M; args.n = *N; args.k = *K; args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; args.lda = *ldA; args.ldb = *ldB; args.ldc = *ldC; args.alpha = (void *)alpha; args.beta = (void *)beta; transA = *TRANSA; transB = *TRANSB; TOUPPER(transA); TOUPPER(transB); transa = -1; transb = -1; if (transA == 'N') transa = 0; if (transA == 'T') transa = 1; #ifndef COMPLEX if (transA == 'R') transa = 0; if (transA == 'C') transa = 1; #else if (transA == 'R') transa = 2; if (transA == 'C') transa = 3; #endif if (transB == 'N') transb = 0; if (transB == 'T') transb = 1; #ifndef COMPLEX if (transB == 'R') transb = 0; if (transB == 'C') transb = 1; #else if (transB == 'R') transb = 2; if (transB == 'C') transb = 3; #endif nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; if (transb & 1) nrowb = args.n; info = 0; if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (transb < 0) info = 2; if (transa < 0) info = 1; if (info){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, 
enum CBLAS_TRANSPOSE TransB, blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, FLOAT *b, blasint ldb, #ifndef COMPLEX FLOAT beta, #else FLOAT *beta, #endif FLOAT *c, blasint ldc) { blas_arg_t args; int transa, transb; blasint nrowa, nrowb, info; XFLOAT *buffer; XFLOAT *sa, *sb; #ifdef SMP #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_REAL; #else int mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) int mode = BLAS_DOUBLE | BLAS_COMPLEX; #else int mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif #endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; #endif PRINT_DEBUG_CNAME; #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β #else args.alpha = (void *)alpha; args.beta = (void *)beta; #endif transa = -1; transb = -1; info = 0; if (order == CblasColMajor) { args.m = m; args.n = n; args.k = k; args.a = (void *)a; args.b = (void *)b; args.c = (void *)c; args.lda = lda; args.ldb = ldb; args.ldc = ldc; if (TransA == CblasNoTrans) transa = 0; if (TransA == CblasTrans) transa = 1; #ifndef COMPLEX if (TransA == CblasConjNoTrans) transa = 0; if (TransA == CblasConjTrans) transa = 1; #else if (TransA == CblasConjNoTrans) transa = 2; if (TransA == CblasConjTrans) transa = 3; #endif if (TransB == CblasNoTrans) transb = 0; if (TransB == CblasTrans) transb = 1; #ifndef COMPLEX if (TransB == CblasConjNoTrans) transb = 0; if (TransB == CblasConjTrans) transb = 1; #else if (TransB == CblasConjNoTrans) transb = 2; if (TransB == CblasConjTrans) transb = 3; #endif nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; if (transb & 1) nrowb = args.n; info = -1; if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m 
< 0) info = 3; if (transb < 0) info = 2; if (transa < 0) info = 1; } if (order == CblasRowMajor) { args.m = n; args.n = m; args.k = k; args.a = (void *)b; args.b = (void *)a; args.c = (void *)c; args.lda = ldb; args.ldb = lda; args.ldc = ldc; if (TransB == CblasNoTrans) transa = 0; if (TransB == CblasTrans) transa = 1; #ifndef COMPLEX if (TransB == CblasConjNoTrans) transa = 0; if (TransB == CblasConjTrans) transa = 1; #else if (TransB == CblasConjNoTrans) transa = 2; if (TransB == CblasConjTrans) transa = 3; #endif if (TransA == CblasNoTrans) transb = 0; if (TransA == CblasTrans) transb = 1; #ifndef COMPLEX if (TransA == CblasConjNoTrans) transb = 0; if (TransA == CblasConjTrans) transb = 1; #else if (TransA == CblasConjNoTrans) transb = 2; if (TransA == CblasConjTrans) transb = 3; #endif nrowa = args.m; if (transa & 1) nrowa = args.k; nrowb = args.k; if (transb & 1) nrowb = args.n; info = -1; if (args.ldc < args.m) info = 13; if (args.ldb < nrowb) info = 10; if (args.lda < nrowa) info = 8; if (args.k < 0) info = 5; if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (transb < 0) info = 2; if (transa < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((args.m == 0) || (args.n == 0)) return; #if 0 fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n", args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); args.common = NULL; args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { #endif (gemm[(transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { #ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef NO_AFFINITY nodes = get_num_nodes(); 
if ((nodes > 1) && get_node_equal()) { args.nthreads /= nodes; gemm_thread_mn(mode, &args, NULL, NULL, gemm[16 | (transb << 2) | transa], sa, sb, nodes); } else { #endif (gemm[16 | (transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); #else GEMM_THREAD(mode, &args, NULL, NULL, gemm[(transb << 2) | transa], sa, sb, args.nthreads); #endif #ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef NO_AFFINITY } #endif #endif #endif #ifdef SMP } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.k + args.k * args.n + args.m * args.n, 2 * args.m * args.n * args.k); IDEBUG_END; return; }
/*
 * Fortran-style entry point dispatching to the spr[] / spr_thread[] kernel
 * tables.  ALPHA points at two consecutive FLOATs (read as alpha_r, alpha_i).
 * UPLO ('U' or 'L', case-insensitive via TOUPPER) selects table entry 0 or 1.
 * Argument errors are reported through xerbla with the 1-based position of
 * the lowest-numbered invalid argument.
 */
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
	  FLOAT *x, blasint *INCX, FLOAT *a){

  char uplo_arg   = *UPLO;
  blasint n       = *N;
  FLOAT alpha_r   = ALPHA[0];
  FLOAT alpha_i   = ALPHA[1];
  blasint incx    = *INCX;

  blasint info;
  int uplo;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_arg);

  switch (uplo_arg) {
  case 'U': uplo =  0; break;
  case 'L': uplo =  1; break;
  default:  uplo = -1; break;
  }

  if (uplo < 0)         info = 1;
  else if (n < 0)       info = 2;
  else if (incx == 0)   info = 5;
  else                  info = 0;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

  /* Nothing to do for an empty problem or a zero scale factor. */
  if (n == 0) return;

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* For a negative stride, move x so that it addresses the element reached
     last when stepping by the (negative) increment. */
  if (incx < 0 ) x -= (n - 1) * incx;

  buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
  nthreads = num_cpu_avail(2);

  if (nthreads != 1) {

    (spr_thread[uplo])(n, ALPHA, x, incx, a, buffer, nthreads);

  } else
#endif
  {

    (spr[uplo])(n, alpha_r, alpha_i, x, incx, a, buffer);

  }

  blas_memory_free(buffer);

  FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);

  IDEBUG_END;

  return;
}
/*
 * Fortran-style entry point dispatching to the lauum_single[] /
 * lauum_parallel[] kernel tables, selected by UPLO ('U' -> 0, 'L' -> 1,
 * case-insensitive via TOUPPER).
 *
 * On an argument error xerbla is called and *Info receives the negated
 * 1-based position of the lowest-numbered invalid argument.  Otherwise *Info
 * receives the kernel's return value (0 when n == 0).  The function itself
 * always returns 0.
 */
int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){

  blas_arg_t args;

  blasint uplo_arg = *UPLO;
  blasint uplo;
  blasint info;
  FLOAT *buffer;
  /* On PPC440 the work areas are external objects instead of being carved
     out of a freshly allocated buffer. */
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.n   = *N;
  args.a   = (void *)a;
  args.lda = *ldA;

  TOUPPER(uplo_arg);

  uplo = (uplo_arg == 'U') ? 0 : ((uplo_arg == 'L') ? 1 : -1);

  if (uplo < 0)                         info = 1;
  else if (args.n < 0)                  info = 2;
  else if (args.lda < MAX(1,args.n))    info = 4;
  else                                  info = 0;

  if (info) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    *Info = - info;
    return 0;
  }

  *Info = 0;

  if (args.n == 0) return 0;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  buffer = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads != 1) {

    *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0);

  } else
#endif
  {

    *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0);

  }

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(1, .5 * args.n * args.n,
		       2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.))
		       + args.n * (args.n * args.n - 1));

  IDEBUG_END;

  return 0;
}
/*
 * Fortran-style entry point dispatching to the trti2[] kernel table.
 * The table index is (uplo << 1) | diag with
 *   uplo: 'U' -> 0, 'L' -> 1      diag: 'U' -> 0, 'N' -> 1
 * (both case-insensitive via TOUPPER).
 *
 * On an argument error xerbla is called and *Info receives the negated
 * 1-based position of the lowest-numbered invalid argument.  Otherwise *Info
 * receives the kernel's return value (0 when n <= 0).  The function itself
 * always returns 0.
 */
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){

  blas_arg_t args;

  blasint uplo_arg = *UPLO;
  blasint diag_arg = *DIAG;
  blasint uplo, diag;
  blasint info;
  FLOAT *buffer;
  /* On PPC440 the work areas are external objects instead of being carved
     out of a freshly allocated buffer. */
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.n   = *N;
  args.a   = (void *)a;
  args.lda = *ldA;

  TOUPPER(uplo_arg);
  TOUPPER(diag_arg);

  uplo = (uplo_arg == 'U') ? 0 : ((uplo_arg == 'L') ? 1 : -1);
  diag = (diag_arg == 'U') ? 0 : ((diag_arg == 'N') ? 1 : -1);

  if (uplo < 0)                         info = 1;
  else if (diag < 0)                    info = 2;
  else if (args.n < 0)                  info = 3;
  else if (args.lda < MAX(1,args.n))    info = 5;
  else                                  info = 0;

  if (info) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    *Info = - info;
    return 0;
  }

  *Info = 0;

  if (args.n <= 0) return 0;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  buffer = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

  /* The kernel's status is kept in info and also reported to the caller. */
  info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  *Info = info;

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(1, .5 * args.n * args.n,
		       2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.))
		       + 6. * args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.)));

  IDEBUG_END;

  return 0;
}
int main(int argc, char *argv[]) { int i,j,k; machineInformation currentMachine; counterSessionInfo session; initializeCUDA(); // Set machine information from CounterHomeBrew.h currentMachine.cpu_model = CPU_MODEL; currentMachine.num_sockets = NUM_SOCKETS; currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET; currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET; currentMachine.num_cores = NUM_CORES; currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS??? currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX; currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX; // Set session events, umasks and counters used // int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80}; // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01}; session.core_gen_counter_num_used = 5; int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1}; int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x37}; int32 cbo_umasks[] = {0xf}; session.cbo_filter = 0x1f; for (i = 0; i < session.core_gen_counter_num_used; i++) { session.core_event_numbers[i] = core_event_numbers[i]; session.core_umasks[i] = core_umasks[i]; } for (i = 0; i < session.cbo_counter_num_used; i++) { session.cbo_event_numbers[i] = cbo_event_numbers[i]; session.cbo_umasks[i] = cbo_umasks[i]; } int fd[NUM_CORES]; // Arrays to hold counter data... counterData before; counterData after; // some data for doing a naive matmul to test flop counting... // initloop(N); // M,N,K are multiples of the block size.... 
int gpuOuter = atoi(argv[1]); int gpuInner = atoi(argv[2]); int cpuInner = atoi(argv[3]); double minRuntime = atoi(argv[4]); int Md = atoi(argv[5])*block_size; int Nd = atoi(argv[6])*block_size; int Kd = atoi(argv[7])*block_size; int Mh = atoi(argv[8]); int Nh = atoi(argv[9]); int Kh = atoi(argv[10]); char *ts1,*ts2,*ts3,*ts4; char *ts5,*ts6,*ts7,*ts8; double fineTimeStamps[8]; double gTime = 0.0; double cTime = 0.0; double seconds = 0.0; int num_iters; uint64 *coreSums; coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64)); uint64 *sums; sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); float *Atmp = NULL; float *Btmp = NULL; float *Ctmp = NULL; Atmp = (float*) malloc( Mh * Nh * sizeof(float) ); Btmp = (float*) malloc( Nh * sizeof(float) ); Ctmp = (float*) malloc( Mh * sizeof(float) ); randomInit(Atmp,Mh*Nh); randomInit(Btmp,Nh); for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) { seconds = 0.0; for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 ); seconds = read_timer()-seconds; } // num_iters /= 2; free(Atmp); free(Btmp); free(Ctmp); int readyThreads = 0; #pragma omp parallel { int threadNum = omp_get_thread_num(); int numThreads = omp_get_num_threads(); assert(numThreads==2); if (threadNum == 0) { cudaError_t error; int memSizeA = sizeof(float)*Md*Nd; int memSizeB = sizeof(float)*Nd; int memSizeC = sizeof(float)*Md; float *Ahost,*Bhost,*Chost; // use pinned memory on the host for BW and asynch memory transfers.. 
int flags = cudaHostAllocDefault; ts5 = getTimeStamp(); fineTimeStamps[0] = read_timer(); error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} // set local arrays randomInit(Ahost,Md*Nd); randomInit(Bhost,Nd); // allocate device memory float *Adevice,*Bdevice,*Cdevice; error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} fineTimeStamps[1] = read_timer(); ts6 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready GPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);}; //#pragma omp single //{ cudaStream_t stream1; cudaStreamCreate ( &stream1) ; ts3 = getTimeStamp(); fineTimeStamps[2] = read_timer(); gTime = read_timer(); for (int i = 0; i < gpuOuter; i++) GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1); cudaStreamSynchronize(stream1); gTime = read_timer() - gTime; fineTimeStamps[3] = read_timer(); ts4 = getTimeStamp(); cudaFreeHost(Ahost); cudaFreeHost(Bhost); cudaFreeHost(Chost); } else { // uint64 
min_iters = strtoull(argv[4],NULL,0); float *A = NULL; float *B = NULL; float *C = NULL; ts7 = getTimeStamp(); fineTimeStamps[4] = read_timer(); A = (float*) malloc( Mh * Nh * sizeof(float) ); B = (float*) malloc( Nh * sizeof(float) ); C = (float*) malloc( Mh * sizeof(float) ); randomInit(A,Mh*Nh); randomInit(B,Nh); fineTimeStamps[5] = read_timer(); ts8 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready CPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);}; // open the msr files for each core on the machine for (i = 0; i < currentMachine.num_cores; i++) open_msr_file(i,&fd[i]); int socketsProgrammed = 0; for (i = 0; i < currentMachine.num_cores; i++) { int currentCoreFD = fd[i]; stopCounters(i, currentCoreFD, ¤tMachine, &session); programCoreFixedCounters(currentCoreFD); programGeneralPurposeRegisters(currentCoreFD, ¤tMachine, &session); /* Program the Uncore as desired...*/ // Only program the first physical core on each socket. // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm. if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) { programUncoreCounters( currentCoreFD, ¤tMachine, &session); socketsProgrammed++; } } seconds = 0.0; // start the programmed counters... 
for (i = 0; i < currentMachine.num_cores; i++) startCounters( i, fd[i], ¤tMachine, &session); /* READ COUNTERS BEFORE STUFF */ readCounters(fd,¤tMachine,&session, &before); ts1 = getTimeStamp(); fineTimeStamps[6] = read_timer(); seconds = read_timer(); /* DO STUFF */ for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 ); /* END DOING STUFF */ seconds = read_timer()-seconds; fineTimeStamps[7] = read_timer(); ts2 = getTimeStamp(); /* READ COUNTERS AFTER STUFF */ for (i = 0; i < currentMachine.num_cores; i++) stopCounters(i,fd[i],¤tMachine, &session); // printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds); readCounters(fd,¤tMachine,&session,&after); diffCounterData(¤tMachine, &session, &after, &before, &after); for (i = 0; i < currentMachine.num_sockets; i++) { // printf("Socket %d\n",i); for (j = 0; j < currentMachine.num_cores_per_socket; j++) { // printf("%d,",j); for (k = 0; k < session.core_gen_counter_num_used; k++){ // printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]); // bug in the indexing of the core sums??? // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; } // printf("\n"); } } for (i = 0; i < currentMachine.num_sockets; i++) { // printf("%d,",i); for (j = 0; j < currentMachine.num_cbos; j++) { // printf("%d,",j); for (k = 0; k < session.cbo_counter_num_used; k++) { // printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]); // bug in the indexing of the core sums??? 
// sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } } } // printf("\n"); // Stop counters, reset PMU, close msr files cleanup(fd,¤tMachine,&session); free(A); free(B); free(C); } } // end parallel region printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0); for (int i = 0; i < 8; i++) printf("%f,",fineTimeStamps[i]); for (j = 0; j < session.core_gen_counter_num_used; j++) printf("%llu,",coreSums[j]); for (j = 0; j < session.cbo_counter_num_used; j++) if (j == session.cbo_counter_num_used-1) printf("%llu",sums[j]); else printf("%llu,",sums[j]); printf("\n"); free(sums); free(coreSums); return 0; }
void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT alpha = *ALPHA; FLOAT beta = *BETA; FLOAT *buffer; #ifdef SMP int nthreads; int nthreads_max; int nthreads_avail; double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 0; if (trans == 'C') i = 1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; int nthreads_max; int nthreads_avail; double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; 
if (TransA == CblasConjTrans) trans = 0; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans) lenx = m; if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; #ifdef MAX_STACK_ALLOC // make it volatile because some gemv implementation (ex: dgemv_n.S) // do not restore all register volatile int stack_alloc_size = 0; //for gemv_n and gemv_t, try to allocate on stack stack_alloc_size = m + n; #ifdef ALIGNED_ACCESS stack_alloc_size += 3; #endif // if(stack_alloc_size < 128) //dgemv_n.S require a 128 bytes buffer // increasing instead of capping 128 // ABI STACK for windows 288 bytes stack_alloc_size += 288 / sizeof(FLOAT) ; if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) stack_alloc_size = 0; // stack overflow check volatile double stack_check = 3.14159265358979323846; FLOAT stack_buffer[stack_alloc_size]; buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); // printf("stack_alloc_size=%d\n", stack_alloc_size); #else //Original OpenBLAS/GotoBLAS codes. 
buffer = (FLOAT *)blas_memory_alloc(1); #endif #ifdef SMP nthreads_max = num_cpu_avail(2); nthreads_avail = nthreads_max; MNK = (double) m * (double) n; if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) nthreads = nthreads_avail; else nthreads = nthreads_max; if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif // stack overflow check assert(stack_check==3.14159265358979323846); #ifdef MAX_STACK_ALLOC if(!stack_alloc_size){ blas_memory_free(buffer); } #else blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; }
/*
 * Complex symmetric matrix-vector interface.
 *
 * UPLO ('U'/'L', case-folded with TOUPPER) selects SYMV_U or SYMV_L (and the
 * matching threaded kernel under SMP).  ALPHA and BETA are read as
 * (real, imaginary) pairs.  c is first scaled by BETA unless BETA is exactly
 * (ONE, ZERO); the kernel is skipped when ALPHA is exactly (ZERO, ZERO).
 *
 * Argument errors are reported through xerbla with the 1-based argument
 * position (1 = UPLO, 2 = N, 5 = LDA, 7 = INCX, 10 = INCY).
 */
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
          FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){

  char    uplo_ch = *UPLO;
  blasint n       = *N;
  FLOAT   alpha_r = ALPHA[0];
  FLOAT   alpha_i = ALPHA[1];
  blasint lda     = *LDA;
  blasint incx    = *INCX;
  FLOAT   beta_r  = BETA[0];
  FLOAT   beta_i  = BETA[1];
  blasint incy    = *INCY;

  /* Single-threaded kernels, indexed by the decoded triangle (0 = 'U', 1 = 'L'). */
  int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
    SYMV_U, SYMV_L,
  };

#ifdef SMP
  /* Threaded kernels, same indexing. */
  int (*symv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = {
    SYMV_THREAD_U, SYMV_THREAD_L,
  };
#endif

  blasint info;
  int uplo = -1;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_ch);

  switch (uplo_ch) {
  case 'U': uplo = 0; break;
  case 'L': uplo = 1; break;
  default:            break;
  }

  /* The lowest-numbered bad argument wins. */
  if      (uplo < 0)           info =  1;
  else if (n < 0)              info =  2;
  else if (lda < MAX(1, n))    info =  5;
  else if (incx == 0)          info =  7;
  else if (incy == 0)          info = 10;
  else                         info =  0;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

  if (n == 0) return;

  /* c := BETA * c, skipped only for BETA == 1 + 0i. */
  if (!((beta_r == ONE) && (beta_i == ZERO))) {
    SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0);
  }

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* For negative strides, move the pointer to the last logical element. */
  if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE;
  if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE;

  buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
  nthreads = num_cpu_avail(2);

  if (nthreads != 1) {

    (symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads);

  } else
#endif
  {

    (symv[uplo])(n, n, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer);

  }

  blas_memory_free(buffer);

  FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n);

  IDEBUG_END;

  return;
}
int main(int argc, char *argv[]) { int i,j,k; char *ts1,*ts2; machineInformation currentMachine; counterSessionInfo session; double seconds = 0.0; // Set machine information from CounterHomeBrew.h currentMachine.cpu_model = CPU_MODEL; currentMachine.num_sockets = NUM_SOCKETS; currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET; currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET; currentMachine.num_cores = NUM_CORES; currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX; currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX; // Set session events, umasks and counters used // NHM-EX session.core_gen_counter_num_used = 0; int32 core_event_numbers[] = {}; int32 core_umasks[] = {}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x14}; int32 cbo_umasks[] = {0x7}; // JKT /* session.core_gen_counter_num_used = 5; int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1}; int32 core_umasks[] = {0x80,0x10,0x02,0x01, 0x07}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x37}; int32 cbo_umasks[] = {0xf}; session.cbo_filter = 0x1f; */ for (i = 0; i < session.core_gen_counter_num_used; i++) { session.core_event_numbers[i] = core_event_numbers[i]; session.core_umasks[i] = core_umasks[i]; } for (i = 0; i < session.cbo_counter_num_used; i++) { session.cbo_event_numbers[i] = cbo_event_numbers[i]; session.cbo_umasks[i] = cbo_umasks[i]; } int fd[NUM_CORES]; // Arrays to hold counter data... counterData before; counterData after; // some data for doing a naive matmul to test flop counting... 
// initloop(N); uint64 min_iters = 2; double minRuntime = 10.0; int M = atoi(argv[1]); int K = atoi(argv[2]); double *A = NULL; double *b = NULL; double *c = NULL; // posix_memalign((void**)A,64,M*K*sizeof(double)); // posix_memalign((void**)B,64,K*N*sizeof(double)); // posix_memalign((void**)C,64,M*N*sizeof(double)); A = (double*) malloc( M * K * sizeof(double) ); b = (double*) malloc( K * sizeof(double) ); c = (double*) malloc( M * sizeof(double) ); fill( A, M * K ); fill( b, K ); fill( c, M ); // open the msr files for each core on the machine for (i = 0; i < currentMachine.num_cores; i++) open_msr_file(i,&fd[i]); // warm up da caches... BLASFUNC( CblasColMajor,CblasNoTrans,M,K, 1, A,M, b,1, 1, c,1 ); // Program the counters!!! int socketsProgrammed = 0; for (i = 0; i < currentMachine.num_cores; i++) { int currentCoreFD = fd[i]; /* clear global control register before programming */ stopCounters(i, currentCoreFD, ¤tMachine, &session); /* set up the fixed counters on each core */ programCoreFixedCounters(currentCoreFD); /* set up the general purpose registers for each core */ programGeneralPurposeRegisters(currentCoreFD, ¤tMachine, &session); /* Program the Uncore as desired...*/ // Only program the first physical core on each socket. // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm. 
#if CPU_MODEL == JAKETOWN if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) #elif CPU_MODEL == NEHALEM_EX if (i < currentMachine.num_sockets && socketsProgrammed < currentMachine.num_sockets) #elif CPU_MODEL == IVY_BRIDGE if (i < currentMachine.num_sockets && socketsProgrammed < currentMachine.num_sockets) #endif { programUncoreCounters( currentCoreFD, ¤tMachine, &session); socketsProgrammed++; } /* set global control register to active counters */ // startCounters( i, currentCoreFD, ¤tMachine, &session); } uint64 num_iters; for (num_iters = min_iters; seconds < minRuntime; num_iters *=2) { if (num_iters != min_iters) { free(ts1); free(ts2); } sleep(5); seconds = 0.0; // start the programmed counters... for (i = 0; i < currentMachine.num_cores; i++) startCounters( i, fd[i], ¤tMachine, &session); /* READ COUNTERS BEFORE STUFF */ readCounters(fd,¤tMachine,&session, &before); ts1 = getTimeStamp(); seconds = read_timer(); /* DO STUFF */ for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,M,K, 1, A,M, b,1, 1, c,1 ); /* END DOING STUFF */ seconds = read_timer()-seconds; ts2 = getTimeStamp(); /* READ COUNTERS AFTER STUFF */ for (i = 0; i < currentMachine.num_cores; i++) stopCounters(i,fd[i],¤tMachine, &session); } num_iters /= 2; readCounters(fd,¤tMachine,&session,&after); diffCounterData(¤tMachine, &session, &after, &before, &after); uint64 *coreSums; coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64)); for (i = 0; i < currentMachine.num_sockets; i++) { for (j = 0; j < currentMachine.num_cores_per_socket; j++) { for (k = 0; k < session.core_gen_counter_num_used; k++) // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; } } uint64 *sums; sums = 
(uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); for (i = 0; i < currentMachine.num_sockets; i++) { // printf("Socket %d\n",i); for (j = 0; j < currentMachine.num_cbos; j++) { // printf("%d,",j); for (k = 0; k < session.cbo_counter_num_used; k++) { // printf("%"PRIu64",",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]); // bug in the indexing of the core sums??? // sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } // printf("\n"); } } /* sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); for (i = 0; i < currentMachine.num_sockets; i++) { for (j = 0; j < currentMachine.num_cbos; j++) { for (k = 0; k < session.cbo_counter_num_used; k++) // sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } } */ // only print data from first socket and core printf("%s,%s,%"PRIu64",%d,%d,%d,%f,",ts1,ts2,num_iters,M,K,K,seconds/(double)num_iters); for (j = 0; j < session.core_gen_counter_num_used; j++) // printf("%"PRIu64",",after.generalCore[0][j]); printf("%f,",coreSums[j]/(double)num_iters); for (j = 0; j < session.cbo_counter_num_used; j++) printf("%f,",sums[j]/(double)num_iters); printf("\n"); free(sums); free(coreSums); // Stop counters, reset PMU, close msr files cleanup(fd,¤tMachine,&session); free(A); free(b); free(c); return 0; }
void NAME(char *SIDE, char *UPLO, blasint *M, blasint *N, FLOAT *alpha, FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB, FLOAT *beta, FLOAT *c, blasint *ldC){ char side_arg = *SIDE; char uplo_arg = *UPLO; blas_arg_t args; FLOAT *buffer; FLOAT *sa, *sb; #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif blasint info; int side; int uplo; PRINT_DEBUG_NAME; args.alpha = (void *)alpha; args.beta = (void *)beta; TOUPPER(side_arg); TOUPPER(uplo_arg); side = -1; uplo = -1; if (side_arg == 'L') side = 0; if (side_arg == 'R') side = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; args.m = *M; args.n = *N; args.c = (void *)c; args.ldc = *ldC; info = 0; if (args.ldc < MAX(1, args.m)) info = 12; if (!side) { args.a = (void *)a; args.b = (void *)b; args.lda = *ldA; args.ldb = *ldB; if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; args.lda = *ldB; args.ldb = *ldA; if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint m, blasint n, #ifndef COMPLEX FLOAT alpha, #else FLOAT *alpha, #endif FLOAT *a, blasint lda, FLOAT *b, blasint ldb, #ifndef COMPLEX FLOAT beta, #else FLOAT *beta, #endif FLOAT *c, blasint ldc) { blas_arg_t args; int side, uplo; blasint info; FLOAT *buffer; FLOAT *sa, *sb; #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif PRINT_DEBUG_CNAME; #ifndef COMPLEX args.alpha = (void *)α args.beta = (void *)β #else args.alpha = (void *)alpha; args.beta = (void *)beta; #endif args.c = (void *)c; args.ldc = ldc; side = -1; uplo = -1; info = 0; if (order == CblasColMajor) { if (Side == CblasLeft) side = 0; if (Side == CblasRight) side = 1; if (Uplo == CblasUpper) 
uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; args.m = m; args.n = n; if (args.ldc < MAX(1, args.m)) info = 12; if (!side) { args.a = (void *)a; args.b = (void *)b; args.lda = lda; args.ldb = ldb; if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; args.lda = ldb; args.ldb = lda; if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; } if (order == CblasRowMajor) { if (Side == CblasLeft) side = 1; if (Side == CblasRight) side = 0; if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; args.m = n; args.n = m; if (args.ldc < MAX(1, args.m)) info = 12; if (!side) { args.a = (void *)a; args.b = (void *)b; args.lda = lda; args.ldb = ldb; if (args.ldb < MAX(1, args.m)) info = 9; if (args.lda < MAX(1, args.m)) info = 7; } else { args.a = (void *)b; args.b = (void *)a; args.lda = ldb; args.ldb = lda; if (args.lda < MAX(1, args.m)) info = 9; if (args.ldb < MAX(1, args.n)) info = 7; } if (args.n < 0) info = 4; if (args.m < 0) info = 3; if (uplo < 0) info = 2; if (side < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (args.m == 0 || args.n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); buffer = (FLOAT *)blas_memory_alloc(0); sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP args.common = NULL; args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { #endif (symm[(side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); #ifdef SMP } else { #ifndef NO_AFFINITY nodes = get_num_nodes(); if (nodes > 1) { args.nthreads /= nodes; gemm_thread_mn(MODE, &args, NULL, NULL, symm[4 | (side << 1) | uplo ], sa, sb, nodes); } else { #endif #ifndef 
USE_SIMPLE_THREADED_LEVEL3 (symm[4 | (side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); #else GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); #endif #ifndef NO_AFFINITY } #endif } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, (!side)? args.m * (args.m / 2 + args.n) : args.n * (args.m + args.n / 2), (!side)? 2 * args.m * args.m * args.n : 2 * args.m * args.n * args.n); IDEBUG_END; return; }
void NAME(char *UPLO, char *TRANS, char *DIAG, blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ char uplo_arg = *UPLO; char trans_arg = *TRANS; char diag_arg = *DIAG; blasint n = *N; blasint incx = *INCX; blasint info; int uplo; int unit; int trans; FLOAT *buffer; PRINT_DEBUG_NAME; TOUPPER(uplo_arg); TOUPPER(trans_arg); TOUPPER(diag_arg); trans = -1; unit = -1; uplo = -1; if (trans_arg == 'N') trans = 0; if (trans_arg == 'T') trans = 1; if (trans_arg == 'R') trans = 0; if (trans_arg == 'C') trans = 1; if (diag_arg == 'U') unit = 0; if (diag_arg == 'N') unit = 1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, FLOAT *x, blasint incx) { int trans, uplo, unit; blasint info; FLOAT *buffer; PRINT_DEBUG_CNAME; unit = -1; uplo = -1; trans = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; if (Diag == CblasUnit) unit = 0; if (Diag == CblasNonUnit) unit = 1; info = -1; if (incx == 0) info = 7; if (n < 0) info = 4; if (unit < 0) info = 3; if (trans < 
0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); (tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; }
void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 2; if (trans == 'C') i = 3; if (trans == 'O') i = 4; if (trans == 'U') i = 5; if (trans == 'S') i = 6; if (trans == 'D') i = 7; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1,m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; PRINT_DEBUG_CNAME; FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; info = 
-1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return if possible. */ if (m == 0 || n == 0) return; lenx = n; leny = m; if (trans & 1) lenx = m; if (trans & 1) leny = n; if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); IDEBUG_END; return; }
/*
 * sdsdot: returns *alpha plus the result of dsdot over x and y.
 * The sum is formed in the type produced by the usual arithmetic conversions
 * and converted to float on return.
 */
float BLASFUNC(sdsdot)(int* n, float* alpha, float* x, int* incx, float* y, int* incy)
{
  return BLASFUNC(dsdot)(n, x, incx, y, incy) + *alpha;
}
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *a){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint incx = *INCX; blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incx == 0) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (spr[uplo])(n, alpha, x, incx, a, buffer); #ifdef SMP } else { (spr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); IDEBUG_END; return; }
#ifndef CBLAS

/*
 * Fortran-convention entry point (every argument is passed by pointer).
 *
 * Decodes UPLO / TRANS / DIAG, validates the remaining arguments and falls
 * through to the shared tail after the #endif, which dispatches through
 * the tbmv / tbmv_thread kernel tables (defined outside this block).
 *
 *   UPLO   'U' -> 0, 'L' -> 1 (after TOUPPER); otherwise error 1
 *   TRANS  'N' and 'R' -> 0, 'T' and 'C' -> 1; otherwise error 2
 *   DIAG   'U' -> unit = 0, 'N' -> unit = 1; otherwise error 3
 *   N      problem size; negative -> error 4
 *   K      band width parameter; negative -> error 5
 *   a/LDA  matrix operand; LDA < K + 1 -> error 7
 *   x/INCX vector operand, updated by the kernel; INCX == 0 -> error 9
 */
void NAME(char *UPLO, char *TRANS, char *DIAG,
          blasint *N, blasint *K,
          FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blasint n    = *N;
  blasint k    = *K;
  blasint lda  = *LDA;
  blasint incx = *INCX;

  blasint info;
  int uplo;
  int unit;
  int trans;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  unit  = -1;
  uplo  = -1;

  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
  if (trans_arg == 'C') trans = 1;

  if (diag_arg == 'U') unit = 0;
  if (diag_arg == 'N') unit = 1;

  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  /* Checks run from the last argument to the first, so the
     lowest-numbered bad argument is the one reported. */
  info = 0;

  if (incx == 0)   info = 9;
  if (lda < k + 1) info = 7;
  if (k < 0)       info = 5;
  if (n < 0)       info = 4;
  if (unit < 0)    info = 3;
  if (trans < 0)   info = 2;
  if (uplo < 0)    info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS-convention entry point (scalars by value, explicit storage order).
 *
 * For CblasRowMajor both the upper/lower selector and the transpose
 * selector are inverted relative to the column-major mapping.  info
 * starts at 0 and is only reset to -1 inside the two recognised order
 * branches, so an unrecognised order is reported with info == 0.
 */
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
           enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
           blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {

  int trans, uplo, unit;
  blasint info;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_CNAME;

  unit  = -1;
  uplo  = -1;
  trans = -1;
  info  = 0;

  if (order == CblasColMajor) {
    if (Uplo == CblasUpper) uplo = 0;
    if (Uplo == CblasLower) uplo = 1;

    if (TransA == CblasNoTrans)     trans = 0;
    if (TransA == CblasTrans)       trans = 1;
    if (TransA == CblasConjNoTrans) trans = 0;
    if (TransA == CblasConjTrans)   trans = 1;

    if (Diag == CblasUnit)    unit = 0;
    if (Diag == CblasNonUnit) unit = 1;

    info = -1;

    if (incx == 0)   info = 9;
    if (lda < k + 1) info = 7;
    if (k < 0)       info = 5;
    if (n < 0)       info = 4;
    if (unit < 0)    info = 3;
    if (trans < 0)   info = 2;
    if (uplo < 0)    info = 1;
  }

  if (order == CblasRowMajor) {
    if (Uplo == CblasUpper) uplo = 1;
    if (Uplo == CblasLower) uplo = 0;

    if (TransA == CblasNoTrans)     trans = 1;
    if (TransA == CblasTrans)       trans = 0;
    if (TransA == CblasConjNoTrans) trans = 1;
    if (TransA == CblasConjTrans)   trans = 0;

    if (Diag == CblasUnit)    unit = 0;
    if (Diag == CblasNonUnit) unit = 1;

    info = -1;

    if (incx == 0)   info = 9;
    if (lda < k + 1) info = 7;
    if (k < 0)       info = 5;
    if (n < 0)       info = 4;
    if (unit < 0)    info = 3;
    if (trans < 0)   info = 2;
    if (uplo < 0)    info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* ---- Shared tail: whichever header was compiled continues here. ---- */

  if (n == 0) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* With a negative stride, advance x by (n - 1) * |incx| elements so it
     points at the opposite end of the strided vector. */
  if (incx < 0 ) x -= (n - 1) * incx;

  buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
  nthreads = num_cpu_avail(2);

  if (nthreads == 1) {
#endif

    /* Kernel index packs the three selectors: bit 2 = trans,
       bit 1 = uplo, bit 0 = unit. */
    (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer);

#ifdef SMP
  } else {

    (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads);

  }
#endif

  blas_memory_free(buffer);

  FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k);

  IDEBUG_END;

  return;
}
void NAME(char *TRANS, blasint *M, blasint *N, blasint *KU, blasint *KL, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint ku = *KU; blasint kl = *KL; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; #ifdef SMP int nthreads; #endif FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 2; if (trans == 'C') i = 3; if (trans == 'O') i = 4; if (trans == 'U') i = 5; if (trans == 'S') i = 6; if (trans == 'D') i = 7; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, blasint ku, blasint kl, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; info = -1; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == 
CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; info = -1; t = n; n = m; m = t; t = ku; ku = kl; kl = t; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans & 1) lenx = m; if (trans & 1) leny = n; if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gbmv[(int)trans])(m, n, kl, ku, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gbmv_thread[(int)trans])(m, n, kl, ku, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, m * n / 2 + n, m * n); IDEBUG_END; return; }
#ifndef CBLAS

/*
 * Fortran-convention entry point (every argument is passed by pointer).
 *
 * Validates the arguments and falls through to the shared tail after the
 * #endif, which dispatches through the her2 / her2_thread kernel tables
 * (defined outside this block).  ALPHA is read as two consecutive FLOATs
 * (index 0 = real part, index 1 = imaginary part).
 *
 *   UPLO   'U' -> 0, 'L' -> 1 (after TOUPPER); otherwise error 1
 *   N      problem size; negative -> error 2
 *   x/INCX first vector operand; INCX == 0 -> error 5
 *   y/INCY second vector operand; INCY == 0 -> error 7
 *   a/LDA  matrix operand; LDA < MAX(1, N) -> error 9
 */
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
          FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){

  char uplo_arg = *UPLO;
  blasint n     = *N;
  FLOAT alpha_r = ALPHA[0];
  FLOAT alpha_i = ALPHA[1];
  blasint lda   = *LDA;
  blasint incx  = *INCX;
  blasint incy  = *INCY;

  blasint info;
  int uplo;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_arg);
  uplo = -1;

  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  /* Checks run from the last argument to the first, so the
     lowest-numbered bad argument is the one reported. */
  info = 0;

  if (lda < MAX(1, n)) info = 9;
  if (incy == 0)       info = 7;
  if (incx == 0)       info = 5;
  if (n < 0)           info = 2;
  if (uplo < 0)        info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS-convention entry point (dimensions by value, explicit storage
 * order; ALPHA still points at a real/imaginary FLOAT pair).
 *
 * Row-major input selects kernel slots 3 (upper) and 2 (lower) instead of
 * 0 and 1, and reports a zero incx as error 7 and a zero incy as error 5,
 * i.e. the two numbers are exchanged relative to the column-major branch.
 * NOTE(review): that exchange matches what a reference CBLAS wrapper
 * produces when it forwards x and y swapped to the Fortran routine --
 * confirm this is the intended numbering.
 *
 * info starts at 0 and is only reset to -1 inside the two recognised
 * order branches, so an unrecognised order is reported with info == 0.
 */
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA,
           FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {

  FLOAT alpha_r = ALPHA[0];
  FLOAT alpha_i = ALPHA[1];
  FLOAT *buffer;
  int uplo;
  blasint info;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_CNAME;

  uplo = -1;
  info = 0;

  if (order == CblasColMajor) {
    if (Uplo == CblasUpper) uplo = 0;
    if (Uplo == CblasLower) uplo = 1;

    info = -1;

    if (lda < MAX(1, n)) info = 9;
    if (incy == 0)       info = 7;
    if (incx == 0)       info = 5;
    if (n < 0)           info = 2;
    if (uplo < 0)        info = 1;
  }

  if (order == CblasRowMajor) {
    if (Uplo == CblasUpper) uplo = 3;
    if (Uplo == CblasLower) uplo = 2;

    info = -1;

    if (lda < MAX(1, n)) info = 9;
    if (incx == 0)       info = 7;
    if (incy == 0)       info = 5;
    if (n < 0)           info = 2;
    if (uplo < 0)        info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* ---- Shared tail: whichever header was compiled continues here. ---- */

  /* Nothing to do for an empty problem or when alpha is exactly 0 + 0i. */
  if (n == 0) return;

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* With a negative stride, advance the pointer to the opposite end of the
     strided vector.  The factor 2 presumably accounts for the two FLOATs
     per complex element -- confirm against the kernel layout. */
  if (incx < 0 ) x -= (n - 1) * incx * 2;
  if (incy < 0 ) y -= (n - 1) * incy * 2;

  buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
  nthreads = num_cpu_avail(2);

  if (nthreads == 1) {
#endif

    (her2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer);

#ifdef SMP
  } else {

    (her2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads);

  }
#endif

  blas_memory_free(buffer);

  FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n);

  IDEBUG_END;

  return;
}
#ifndef CBLAS

/*
 * Fortran-convention entry point (every argument is passed by pointer).
 *
 * Decodes UPLO / TRANS / DIAG, validates the remaining arguments and falls
 * through to the shared tail after the #endif, which sizes a scratch
 * buffer and dispatches through the trmv / trmv_thread kernel tables
 * (defined outside this block).
 *
 *   UPLO   'U' -> 0, 'L' -> 1 (after TOUPPER); otherwise error 1
 *   TRANS  'N','T','R','C' -> 0..3; otherwise error 2
 *   DIAG   'U' -> unit = 0, 'N' -> unit = 1; otherwise error 3
 *   N      problem size; negative -> error 4
 *   a/LDA  matrix operand; LDA < MAX(1, N) -> error 6
 *   x/INCX vector operand, updated by the kernel; INCX == 0 -> error 8
 */
void NAME(char *UPLO, char *TRANS, char *DIAG,
          blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blasint n    = *N;
  blasint lda  = *LDA;
  blasint incx = *INCX;

  blasint info;
  int uplo;
  int unit;
  int trans, buffer_size;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  unit  = -1;
  uplo  = -1;

  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 2;
  if (trans_arg == 'C') trans = 3;

  if (diag_arg == 'U') unit = 0;
  if (diag_arg == 'N') unit = 1;

  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  /* Checks run from the last argument to the first, so the
     lowest-numbered bad argument is the one reported. */
  info = 0;

  if (incx == 0)       info = 8;
  if (lda < MAX(1, n)) info = 6;
  if (n < 0)           info = 4;
  if (unit < 0)        info = 3;
  if (trans < 0)       info = 2;
  if (uplo < 0)        info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS-convention entry point (dimensions by value, explicit storage
 * order).
 *
 * For CblasRowMajor the upper/lower selector is inverted and the
 * transpose selector is remapped (0<->1, 2<->3).  info starts at 0 and is
 * only reset to -1 inside the two recognised order branches, so an
 * unrecognised order is reported with info == 0.
 */
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
           enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
           blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {

  int trans, uplo, unit, buffer_size;
  blasint info;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  PRINT_DEBUG_CNAME;

  unit  = -1;
  uplo  = -1;
  trans = -1;
  info  = 0;

  if (order == CblasColMajor) {
    if (Uplo == CblasUpper) uplo = 0;
    if (Uplo == CblasLower) uplo = 1;

    if (TransA == CblasNoTrans)     trans = 0;
    if (TransA == CblasTrans)       trans = 1;
    if (TransA == CblasConjNoTrans) trans = 2;
    if (TransA == CblasConjTrans)   trans = 3;

    if (Diag == CblasUnit)    unit = 0;
    if (Diag == CblasNonUnit) unit = 1;

    info = -1;

    if (incx == 0)       info = 8;
    if (lda < MAX(1, n)) info = 6;
    if (n < 0)           info = 4;
    if (unit < 0)        info = 3;
    if (trans < 0)       info = 2;
    if (uplo < 0)        info = 1;
  }

  if (order == CblasRowMajor) {
    if (Uplo == CblasUpper) uplo = 1;
    if (Uplo == CblasLower) uplo = 0;

    if (TransA == CblasNoTrans)     trans = 1;
    if (TransA == CblasTrans)       trans = 0;
    if (TransA == CblasConjNoTrans) trans = 3;
    if (TransA == CblasConjTrans)   trans = 2;

    if (Diag == CblasUnit)    unit = 0;
    if (Diag == CblasNonUnit) unit = 1;

    info = -1;

    if (incx == 0)       info = 8;
    if (lda < MAX(1, n)) info = 6;
    if (n < 0)           info = 4;
    if (unit < 0)        info = 3;
    if (trans < 0)       info = 2;
    if (uplo < 0)        info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* ---- Shared tail: whichever header was compiled continues here. ---- */

  if (n == 0) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* With a negative stride, advance x to the opposite end of the strided
     vector.  The factor 2 presumably accounts for the two FLOATs per
     complex element -- confirm against the kernel layout. */
  if (incx < 0 ) x -= (n - 1) * incx * 2;

#ifdef SMP
  /* Calibrated on a Xeon E5-2630 */
  /* Thread count by problem size (n * n): at or below the first threshold
     run single-threaded; between the two thresholds cap at 2 threads;
     above, use whatever num_cpu_avail reports. */
  if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
    nthreads = num_cpu_avail(2);
    if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
      nthreads = 2;
  } else
    nthreads = 1;

  if(nthreads > 1) {
    /* Threaded path: scratch is only requested for n <= 16. */
    buffer_size = n > 16 ? 0 : n * 4 + 40;
  }
  else
#endif
  {
    /* Single-threaded path: size depends on the number of DTB_ENTRIES
       blocks, plus room for n * 2 FLOATs when x is not unit-stride. */
    buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
    if(incx != 1)
      buffer_size += n * 2;
  }
  STACK_ALLOC(buffer_size, FLOAT, buffer);

#ifdef SMP
  if (nthreads == 1) {
#endif

    /* Kernel index packs the three selectors: bits 3..2 = trans,
       bit 1 = uplo, bit 0 = unit. */
    (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);

#ifdef SMP
  } else {

    (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads);

  }
#endif

  STACK_FREE(buffer);

  FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);

  IDEBUG_END;

  return;
}
#ifndef CBLAS

/*
 * Fortran-convention entry point (every argument is passed by pointer).
 *
 * Decodes TRANS, validates the remaining arguments and falls through to
 * the shared tail after the #endif, which scales y by beta and then calls
 * either the local gemv table (GEMV_N / GEMV_T) or gemv_thread (defined
 * outside this block).
 *
 *   TRANS  'N' and 'R' -> 0, 'T' and 'C' -> 1 (after TOUPPER);
 *          anything else -> error 1
 *   M, N   matrix dimensions; negative -> error 2 / 3
 *   ALPHA  scalar applied by the kernel; early return when it equals ZERO
 *   a/LDA  matrix operand; LDA < MAX(1, M) -> error 6
 *   x/INCX input vector; INCX == 0 -> error 8
 *   BETA   scalar applied to y before the kernel runs
 *   y/INCY output vector; INCY == 0 -> error 11
 */
void NAME(char *TRANS, blasint *M, blasint *N,
          FLOAT *ALPHA, FLOAT *a, blasint *LDA,
          FLOAT *x, blasint *INCX,
          FLOAT *BETA, FLOAT *y, blasint *INCY){

  char trans = *TRANS;
  blasint m    = *M;
  blasint n    = *N;
  blasint lda  = *LDA;
  blasint incx = *INCX;
  blasint incy = *INCY;
  FLOAT alpha  = *ALPHA;
  FLOAT beta   = *BETA;
  FLOAT *buffer;
#ifdef SMP
  int nthreads;
#endif

  /* Single-threaded kernels, indexed by the decoded transpose flag. */
  int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
    GEMV_N, GEMV_T,
  };

  blasint info;
  blasint lenx, leny;
  blasint i;

  PRINT_DEBUG_NAME;

  TOUPPER(trans);

  info = 0;

  i = -1;

  if (trans == 'N') i = 0;
  if (trans == 'T') i = 1;
  if (trans == 'R') i = 0;
  if (trans == 'C') i = 1;

  /* Checks run from the last argument to the first, so the
     lowest-numbered bad argument is the one reported. */
  if (incy == 0)       info = 11;
  if (incx == 0)       info = 8;
  if (lda < MAX(1, m)) info = 6;
  if (n < 0)           info = 3;
  if (m < 0)           info = 2;
  if (i < 0)           info = 1;

  /* From here on trans holds the numeric kernel index, not the letter. */
  trans = i;

  if (info != 0){
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS-convention entry point (scalars by value, explicit storage order).
 *
 * For CblasRowMajor the transpose flag is inverted and m and n are
 * exchanged before validation.  info starts at 0 and is only reset to -1
 * inside the two recognised order branches, so an unrecognised order is
 * reported with info == 0.
 */
void CNAME(enum CBLAS_ORDER order,
           enum CBLAS_TRANSPOSE TransA,
           blasint m, blasint n,
           FLOAT alpha,
           FLOAT *a, blasint lda,
           FLOAT *x, blasint incx,
           FLOAT beta,
           FLOAT *y, blasint incy){

  FLOAT *buffer;
  blasint lenx, leny;
  int trans;
  blasint info, t;
#ifdef SMP
  int nthreads;
#endif

  /* Single-threaded kernels, indexed by the decoded transpose flag. */
  int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
    GEMV_N, GEMV_T,
  };

  PRINT_DEBUG_CNAME;

  trans = -1;
  info  = 0;

  if (order == CblasColMajor) {
    if (TransA == CblasNoTrans)     trans = 0;
    if (TransA == CblasTrans)       trans = 1;
    if (TransA == CblasConjNoTrans) trans = 0;
    if (TransA == CblasConjTrans)   trans = 1;

    info = -1;

    if (incy == 0)       info = 11;
    if (incx == 0)       info = 8;
    if (lda < MAX(1, m)) info = 6;
    if (n < 0)           info = 3;
    if (m < 0)           info = 2;
    if (trans < 0)       info = 1;
  }

  if (order == CblasRowMajor) {
    if (TransA == CblasNoTrans)     trans = 1;
    if (TransA == CblasTrans)       trans = 0;
    if (TransA == CblasConjNoTrans) trans = 1;
    if (TransA == CblasConjTrans)   trans = 0;

    info = -1;

    /* Swap the two dimensions. */
    t = n;
    n = m;
    m = t;

    if (incy == 0)       info = 11;
    if (incx == 0)       info = 8;
    if (lda < MAX(1, m)) info = 6;
    if (n < 0)           info = 3;
    if (m < 0)           info = 2;
    if (trans < 0)       info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* ---- Shared tail: whichever header was compiled continues here. ---- */

  if ((m==0) || (n==0)) return;

  /* Non-transposed: x has n entries and y has m; transposed: the reverse. */
  lenx = n;
  leny = m;
  if (trans) lenx = m;
  if (trans) leny = n;

  /* y is scaled by beta first, unless beta is exactly ONE. */
  if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0);

  if (alpha == ZERO) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* With a negative stride, advance the pointer to the opposite end of the
     strided vector. */
  if (incx < 0) x -= (lenx - 1) * incx;
  if (incy < 0) y -= (leny - 1) * incy;

  buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP

  int nthreads_max = num_cpu_avail(2);
  int nthreads_avail = nthreads_max;

  /* Problems with m * n at or below the threshold stay single-threaded. */
  double MNK = (double) m * (double) n;
  if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) )
    nthreads_max = 1;

  /* nthreads = min(nthreads_max, nthreads_avail) */
  if ( nthreads_max > nthreads_avail )
    nthreads = nthreads_avail;
  else
    nthreads = nthreads_max;

  if (nthreads == 1) {
#endif

    (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer);

#ifdef SMP
  } else {

    (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads);

  }
#endif

  blas_memory_free(buffer);

  FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);

  IDEBUG_END;

  return;
}
#ifndef CBLAS

/*
 * Fortran-convention entry point (every argument is passed by pointer).
 *
 * Validates the arguments and falls through to the shared tail after the
 * #endif, which scales y by beta and dispatches through the hbmv kernel
 * table (defined outside this block).  ALPHA and BETA are each read as two
 * consecutive FLOATs (index 0 = real part, index 1 = imaginary part).
 *
 *   UPLO   'U','L','V','M' -> 0..3 (after TOUPPER); otherwise error 1
 *   N      problem size; negative -> error 2
 *   K      band width parameter; negative -> error 3
 *   a/LDA  matrix operand; LDA < K + 1 -> error 6
 *   x/INCX input vector; INCX == 0 -> error 8
 *   y/INCY output vector; INCY == 0 -> error 11
 *
 * The threaded dispatch is guarded by SMPBUG rather than SMP, so it is
 * only compiled when SMPBUG is defined.
 */
void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
          FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){

  char uplo_arg = *UPLO;
  blasint n     = *N;
  blasint k     = *K;
  FLOAT alpha_r = ALPHA[0];
  FLOAT alpha_i = ALPHA[1];
  blasint lda   = *LDA;
  blasint incx  = *INCX;
  FLOAT beta_r  = BETA[0];
  FLOAT beta_i  = BETA[1];
  blasint incy  = *INCY;

  blasint info;
  int uplo;
  FLOAT *buffer;
#ifdef SMPBUG
  int nthreads;
#endif

  PRINT_DEBUG_NAME;

  TOUPPER(uplo_arg);
  uplo = -1;

  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;
  if (uplo_arg == 'V') uplo = 2;
  if (uplo_arg == 'M') uplo = 3;

  /* Checks run from the last argument to the first, so the
     lowest-numbered bad argument is the one reported. */
  info = 0;

  if (incy == 0)   info = 11;
  if (incx == 0)   info = 8;
  if (lda < k + 1) info = 6;
  if (k < 0)       info = 3;
  if (n < 0)       info = 2;
  if (uplo < 0)    info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#else

/*
 * CBLAS-convention entry point (dimensions by value, explicit storage
 * order; ALPHA and BETA still point at real/imaginary FLOAT pairs).
 *
 * Row-major input selects kernel slots 3 (upper) and 2 (lower) instead of
 * 0 and 1.  info starts at 0 and is only reset to -1 inside the two
 * recognised order branches, so an unrecognised order is reported with
 * info == 0.
 */
void CNAME(enum CBLAS_ORDER order,
           enum CBLAS_UPLO Uplo,
           blasint n, blasint k,
           FLOAT *ALPHA,
           FLOAT *a, blasint lda,
           FLOAT *x, blasint incx,
           FLOAT *BETA,
           FLOAT *y, blasint incy){

  FLOAT alpha_r = ALPHA[0];
  FLOAT alpha_i = ALPHA[1];
  FLOAT beta_r  = BETA[0];
  FLOAT beta_i  = BETA[1];

  FLOAT *buffer;
  int uplo;
  blasint info;
#ifdef SMPBUG
  int nthreads;
#endif

  PRINT_DEBUG_CNAME;

  uplo = -1;
  info = 0;

  if (order == CblasColMajor) {
    if (Uplo == CblasUpper) uplo = 0;
    if (Uplo == CblasLower) uplo = 1;

    info = -1;

    if (incy == 0)   info = 11;
    if (incx == 0)   info = 8;
    if (lda < k + 1) info = 6;
    if (k < 0)       info = 3;
    if (n < 0)       info = 2;
    if (uplo < 0)    info = 1;
  }

  if (order == CblasRowMajor) {
    if (Uplo == CblasUpper) uplo = 3;
    if (Uplo == CblasLower) uplo = 2;

    info = -1;

    if (incy == 0)   info = 11;
    if (incx == 0)   info = 8;
    if (lda < k + 1) info = 6;
    if (k < 0)       info = 3;
    if (n < 0)       info = 2;
    if (uplo < 0)    info = 1;
  }

  if (info >= 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
    return;
  }

#endif

  /* ---- Shared tail: whichever header was compiled continues here. ---- */

  if (n == 0) return;

  /* y is scaled by beta first, unless beta is exactly 1 + 0i. */
  if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0);

  if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;

  IDEBUG_START;

  FUNCTION_PROFILE_START();

  /* With a negative stride, advance the pointer to the opposite end of the
     strided vector; COMPSIZE scales the element offset into FLOATs. */
  if (incx < 0 ) x -= (n - 1) * incx * COMPSIZE;
  if (incy < 0 ) y -= (n - 1) * incy * COMPSIZE;

  buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMPBUG
  nthreads = num_cpu_avail(2);

  if (nthreads == 1) {
#endif

    (hbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer);

#ifdef SMPBUG
  } else {

    (hbmv_thread[uplo])(n, k, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads);

  }
#endif

  blas_memory_free(buffer);

  FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k);

  IDEBUG_END;

  return;
}