static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; #ifdef DEBUG_SYMM printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n", blasArgs->alpha.argFloat, blasArgs->alpha.argDouble, CREAL(blasArgs->alpha.argFloatComplex), CIMAG(blasArgs->alpha.argFloatComplex), CREAL(blasArgs->alpha.argDoubleComplex) , CIMAG(blasArgs->alpha.argDoubleComplex)); printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n", blasArgs->beta.argFloat, blasArgs->beta.argDouble, CREAL(blasArgs->beta.argFloatComplex), CIMAG(blasArgs->beta.argFloatComplex), CREAL(blasArgs->beta.argDoubleComplex) , CIMAG(blasArgs->beta.argDoubleComplex)); #endif INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); INIT_KARG(&args[2], blasArgs->C); initSizeKarg(&args[3], blasArgs->M); initSizeKarg(&args[4], blasArgs->N); initSizeKarg(&args[5], blasArgs->lda.matrix); initSizeKarg(&args[6], blasArgs->ldb.matrix); initSizeKarg(&args[7], blasArgs->ldc.matrix); initSizeKarg(&args[8], blasArgs->offa); //PENDING: offA or offa ?? initSizeKarg(&args[9], blasArgs->offBX); initSizeKarg(&args[10], blasArgs->offCY); assignScalarKarg(&args[11], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[12], &(blasArgs->beta), blasArgs->dtype); return; }
//xHEMM void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, bool useAlpha, cl_float2 alpha, bool useBeta, cl_float2 beta, size_t lda, size_t ldb, size_t ldc, size_t offa, size_t offb, size_t offc ) { ::std::cerr << orderStr(order) << ", " << sideStr(side) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "M = " << M << ", N = " << N << ::std::endl; ::std::cerr << "lda = " << lda << ", ldb = " << ldb << ", ldc = " << ldc<< ::std::endl; if (useAlpha) { ::std::cerr << "alpha = (" << CREAL(alpha) << "," << CIMAG(alpha) << ")" << ::std::endl; } if (useBeta) { ::std::cerr << "beta = (" << CREAL(beta) << "," << CIMAG(beta) << ")" << ::std::endl; } ::std::cerr << "offa = " << offa << ", offb = " << offb << ", offc = " << offc<< ::std::endl; }
/*
 * ZDSCAL entry point: scale a double-complex vector X by the real scalar
 * alpha.  The scalar is widened to (alpha, 0) and the request is handed to
 * doScal() with dtype TYPE_COMPLEX_DOUBLE.
 */
clblasStatus
clblasZdscal(
    size_t N,
    double alpha,
    cl_mem X,
    size_t offx,
    int incx,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
{
    CLBlasKargs kargs;

#ifdef DEBUG_SSCAL
    printf("\nZDSCAL Called\n");
#endif

    /* Start from an all-zero argument bundle, then fill in what ZDSCAL needs. */
    memset(&kargs, 0, sizeof(kargs));

    /* Real scalar promoted to complex: imaginary part is zero. */
    CREAL(kargs.alpha.argDoubleComplex) = alpha;
    CIMAG(kargs.alpha.argDoubleComplex) = 0.0;
    kargs.dtype = TYPE_COMPLEX_DOUBLE;

    return doScal(&kargs, N, X, offx, incx,
                  numCommandQueues, commandQueues,
                  numEventsInWaitList, eventWaitList, events);
}
clblasStatus clblasZher2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; DoubleComplex fBeta; memset(&kargs, 0, sizeof(kargs)); CREAL(fBeta) = beta; CIMAG(fBeta) = 0.0f; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = fBeta; kargs.dtype = TYPE_COMPLEX_DOUBLE; if( order == clblasRowMajor ) { CIMAG( kargs.alpha.argDoubleComplex ) *= -1.0; } return doHer2k(&kargs, order, uplo, trans, N, K, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); }
/* Print the N complex entries of X, one "(re, im)" pair per line. */
static void
printResult(void)
{
    size_t idx = 0;

    printf("\nResult:\n");

    while (idx < N) {
        printf("(%f, %-f) \n", CREAL(X[idx]), CIMAG(X[idx]));
        idx++;
    }
}
clblasStatus clblasZtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; DoubleComplex alpha, beta; #ifdef DEBUG_TBSV printf("ZTBSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TBSV; CREAL(alpha) = -1.0; CIMAG(alpha) = 0.0; CREAL(beta) = 1.0; CIMAG(beta) = 0.0; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doTbsv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); }
/*
 * Print the N x N complex matrix A (element (row, col) read from
 * A[row * N + col]) as tab-separated "(re, im)" pairs, one row per line.
 */
static void
printResult(void)
{
    size_t row, col;

    printf("\nResult:\n");

    for (row = 0; row < N; row++) {
        for (col = 0; col < N; col++) {
            const size_t pos = row * N + col;

            printf("(%9.2lf, %-9.2lf)\t", CREAL( A[ pos ] ), CIMAG( A[ pos ] ));
        }
        printf("\n");
    }
}
/*
 * Regression test tied to the issue linked below.
 *
 * Steps performed by the body:
 *   1. cpotrf('U', n = 10) on the single-precision complex matrix A1.
 *   2. zpotrs('L', n = 10, nrhs = 2) on the double-precision matrix A2 with
 *      right-hand sides B.
 *   3. cpotrf('U', n = 10) on A3, whose initialiser repeats A1's values.
 *   4. Report a CTEST error if the real or imaginary part of A3[91] is NaN.
 *
 * The `info` output of the LAPACK calls is written but never inspected here.
 * The numeric literals are the reproducer data and must not be altered.
 */
//https://github.com/xianyi/OpenBLAS/issues/695 CTEST(potrf, bug_695){ openblas_complex_float A1[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I, -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I, 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I, 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I, -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I, 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I, 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I, 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I, 
3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I, 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I}; char up = 'U'; blasint n=10; blasint info[1]; BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); openblas_complex_double A2[100] = {3.0607147216796875+0.0*I, -0.5905849933624268-0.29020825028419495*I, 0.321084201335907+0.45168760418891907*I, 0.8387917876243591-0.644718587398529*I, -0.3642411530017853+0.051274992525577545*I, 0.8071482181549072+0.33944568037986755*I, 0.013674172572791576+0.21422699093818665*I, 0.35476258397102356+0.42408594489097595*I, -0.5991537570953369-0.23082709312438965*I, -0.0600702166557312-0.2113417387008667*I, -0.7954045534133911+0.7066076993942261*I, 2.807175397872925+0.0*I, -0.1691000759601593+0.313548743724823*I, -0.30911174416542053+0.7447023987770081*I, -0.22347848117351532+0.03316075727343559*I, -0.4088296890258789-1.0214389562606812*I, -0.2344931811094284+0.08056317269802094*I, 0.793269693851471-0.17507623136043549*I, 0.03163455054163933+0.20559945702552795*I, 0.13581633567810059-0.2110036462545395*I, 0.9827471375465393+1.3824869394302368*I, -1.8076121807098389-0.8882446885108948*I, 2.3277781009674072+0.0*I, 0.830405056476593-0.19296252727508545*I, 0.1394239068031311-0.5260677933692932*I, 1.239942193031311-0.09915469586849213*I, 0.06731037050485611-0.059320636093616486*I, 0.11507681757211685-0.1984301060438156*I, -0.6843825578689575+0.4647614359855652*I, 1.213119387626648-0.7757048010826111*I, 2.619997978210449+1.8532984256744385*I, 0.4780699610710144+0.48494184017181396*I, -0.18385779857635498+0.6468567848205566*I, 
2.0811400413513184+0.0*I, -0.035075582563877106+0.09732913225889206*I, 0.27337002754211426-0.9032229781150818*I, -0.8374675512313843+0.0479498989880085*I, 0.6916252374649048+0.45711082220077515*I, 0.1883818507194519+0.06482727080583572*I, -0.32384994626045227+0.05857187137007713*I, -1.8306152820587158-1.2336910963058472*I, 0.5096428990364075-0.5395973920822144*I, -1.833838701248169+0.7064958810806274*I, -1.956626057624817+0.22825956344604492*I, 1.706615924835205+0.0*I, -0.2895336151123047+0.17579378187656403*I, -0.923172116279602-0.4530014097690582*I, 0.5040621757507324-0.37026339769363403*I, -0.2824432849884033-1.0374568700790405*I, 0.1399831622838974+0.4977008104324341*I, 0.32275113463401794+0.015575028955936432*I, -0.7285097241401672-0.10360407829284668*I, 0.041852742433547974-0.655687689781189*I, 0.07081800699234009-0.318013072013855*I, -0.25947219133377075+0.4878614842891693*I, 1.5735365152359009+0.0*I, -0.2647853195667267-0.26654252409935*I, -0.6190430521965027-0.24699924886226654*I, -0.6288471221923828+0.48154571652412415*I, 0.02446540631353855-0.2611822783946991*I, 2.1968812942504883+1.0640623569488525*I, -1.1760060787200928-2.714695692062378*I, 2.5673024654388428+1.9732997417449951*I, 0.3698374927043915-0.54008549451828*I, -0.4763622283935547-0.27821826934814453*I, -1.6697118282318115+0.4017511010169983*I, 1.2674795389175415+0.0*I, 0.3079095482826233-0.07258892804384232*I, -0.5929520130157471-0.038360968232154846*I, 0.04388086497783661-0.025549031794071198*I, 0.27894386649131775+0.9791183471679688*I, -0.42710840702056885+0.0428999662399292*I, -1.1148382425308228-0.1569381207227707*I, 0.8068630695343018+1.5315914154052734*I, -0.6160865426063538-2.0185799598693848*I, -1.439787745475769-0.7550917863845825*I, -0.10051321983337402+0.24303960800170898*I, 0.9066106081008911+0.0*I, 0.05315789580345154-0.06136537343263626*I, -0.21304509043693542+0.6494344472885132*I, 3.0476584434509277+0.1854848861694336*I, -1.7228562831878662+2.8335886001586914*I, 
2.4704504013061523-1.0389463901519775*I, 1.564915418624878-1.6229296922683716*I, -2.7767486572265625+1.769376516342163*I, -0.314566969871521-1.0403450727462769*I, 1.4415971040725708+0.29750674962997437*I, -0.5856801271438599-1.0203559398651123*I, 0.5668219923973083+0.0*I, 0.033351436257362366-0.07832501083612442*I, 0.3842993974685669+0.7050991058349609*I, 1.894256591796875+0.6389734745025635*I, 1.085827112197876-1.2980060577392578*I, -0.11207738518714905+1.2014245986938477*I, 0.04810279607772827-0.9741873741149902*I, -0.31978556513786316+0.13701045513153076*I, 1.2217860221862793-0.856549859046936*I, 0.7103452086448669+0.84221351146698*I, -0.9617416858673096-1.2486815452575684*I, 0.0756804421544075+0.0*I}; openblas_complex_double B[20] = {-0.21782716937787788-0.9222220085490986*I, -0.7620356655676837+0.15533508334193666*I, -0.905011814118756+0.2847570854574069*I, -0.3451346708401685+1.076948486041297*I, 0.25336108035924787+0.975317836492159*I, 0.11192755545114-0.1603741874112385*I, -0.20604111555491242+0.10570814584017311*I, -1.0568488936791578-0.06025820467086475*I, -0.6650468984506477-0.5000967284800251*I, -1.0509472322215125+0.5022165705328413*I, -0.727775859267237+0.50638268521728*I, 0.39947219167701153-0.4576746001199889*I, -0.7122162951294634-0.630289556702497*I, 0.9870834574024372-0.2825689605519449*I, 0.0628393808469436-0.1253397353973715*I, 0.8439562576196216+1.0850814110398734*I, 0.562377322638969-0.2578030745663871*I, 0.12696236014017806-0.09853584666755086*I, -0.023682508769195098+0.18093440285319276*I, -0.7264975746431271+0.31670415674097235*I}; char lo = 'L'; blasint nrhs = 2; BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); // note that this is exactly equal to A1 openblas_complex_float A3[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 
0.3842994-0.7050991*I, -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I, 0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I, 2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I, -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I, 0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I, 2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I, 0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I, 3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I, 0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 
0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I}; BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info); // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91])); if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__); } }
/*
 * Print every incy-th entry of the complex array Y as "(re, im)", one per
 * line.  The number of printed entries is the element count of Y divided
 * by incy.
 */
static void
printResult(void)
{
    size_t k;
    size_t count = (sizeof(Y) / sizeof(cl_double2)) / incy;

    printf("Result:\n");

    for (k = 0; k < count; k++) {
        printf("(%9.2f, %-9.2f)\n", CREAL(Y[k * incy]), CIMAG(Y[k * incy]));
    }
}
/*
 * Print the N x N complex matrix C row by row; element (row, col) is read
 * from C[row + col * ldc].
 */
static void
printResult(void)
{
    size_t row, col;

    printf("Result:\n");

    for (row = 0; row < N; row++) {
        for (col = 0; col < N; col++) {
            const size_t pos = row + col * ldc;

            printf("(%9.2f, %-9.2f) ", CREAL(C[pos]), CIMAG(C[pos]));
        }
        printf("\n");
    }
}
/*
 * Copy the contents of an IMF list into the MEX output arrays.
 *
 * list - list of list.m nodes, each carrying list.n complex samples in
 *        `pointer` and an iteration count in `nb_iterations`.
 * plhs - MEX left-hand-side array; entries 0 and 1 are created here.
 *
 * plhs[0] is an m-by-n complex matrix whose row i holds the samples of the
 * i-th node.  plhs[1] is a 1-by-(m-1) array receiving nb_iterations of the
 * first m-1 nodes (the last node's count is not reported).
 */
void write_output(imf_list_t list,mxArray *plhs[])
{
  double *rout,*iout,*out2;
  imf_t *current;
  int i=0,j,m=list.m,n=list.n;
  /* An empty list would otherwise request a dimension of -1. */
  int nb_counts=(m>0)?m-1:0;

  plhs[0]=mxCreateDoubleMatrix(m,n,mxCOMPLEX);
  rout=mxGetPr(plhs[0]);
  iout=mxGetPi(plhs[0]);

  /* NOTE(review): only the real part of plhs[1] is ever written; mxREAL
     looks sufficient, but the complexity flag is kept so callers see the
     same array class as before - confirm before changing. */
  plhs[1]=mxCreateDoubleMatrix(1,nb_counts,mxCOMPLEX);
  out2=mxGetPr(plhs[1]);

  for (current=list.first;current;current=current->next) {
    /* Row i of the column-major output: element (i,j) lives at j*m+i. */
    for (j=0;j<n;j++) {
      *(rout+j*m+i)=CREAL(current->pointer[j]);
      *(iout+j*m+i)=CIMAG(current->pointer[j]);
    }
    if (i<m-1)
      *(out2+i)=current->nb_iterations;
    i++;
  }
}
// Time one reference HER2K run.
//
// C_ is restored from backC_ first.  When the configured order is not
// column-major, the call is mapped to a column-major one: alpha is
// conjugated, NoTrans and ConjTrans are swapped, and the triangle is
// flipped.  The reference routine is only invoked (and timed) when
// PERF_TEST_WITH_ACML is defined; otherwise 0 is returned.
// Returns NANOTIME_ERR for row-major input unless
// PERF_TEST_WITH_ROW_MAJOR is defined.
template <typename ElemType> nano_time_t
Her2kPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;

#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));

    clblasUplo fUplo = params_.uplo;
    clblasTranspose fTransA = params_.transA;
    ElemType fAlpha = alpha_;

    if (params_.order != clblasColumnMajor) {
        CIMAG( fAlpha ) *= -1.0;

        if (params_.transA == clblasNoTrans) {
            fTransA = clblasConjTrans;
        }
        else {
            fTransA = clblasNoTrans;
        }

        if (params_.uplo == clblasUpper) {
            fUplo = clblasLower;
        }
        else {
            fUplo = clblasUpper;
        }
    }

#ifdef PERF_TEST_WITH_ACML
    time = getCurrentTime();
    clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params_.N, params_.K, fAlpha,
                        A_, 0, params_.lda, B_, 0, params_.ldb, CREAL( beta_), C_, 0, params_.ldc);
    time = getCurrentTime() - time;
#endif  // PERF_TEST_WITH_ACML

    return time;
}
/*
 * Print the packed triangular matrix AP as an N x N grid.  Cells outside
 * the stored triangle (below the diagonal for clblasUpper, above it for
 * clblasLower) are left blank; the remaining cells consume AP entries in
 * the order they are visited.
 */
static void
printResult(void)
{
    size_t row, col;
    size_t next = 0;

    printf("\nResult:\n");

    for (row = 0; row < N; row++) {
        for (col = 0; col < N; col++) {
            const int blank = ((uplo == clblasUpper) && (row > col)) ||
                              ((uplo == clblasLower) && (col > row));

            if (blank) {
                printf("\t\t\t");
            }
            else {
                printf("(%9.2lf, %-9.2lf)\t", CREAL( AP[ next ] ), CIMAG( AP[ next ] ));
                next++;
            }
        }
        printf("\n");
    }
}
//HER2 void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, cl_float2 alpha, size_t offx, int incx, size_t offy, int incy, size_t offa, size_t lda) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl; ::std::cerr << "offa = " << offa << ::std::endl; if( lda ) ::std::cerr << ", lda = " << lda << ::std::endl; if(useAlpha) ::std::cerr << "alpha = (" << CREAL(alpha) << ", " << CIMAG(alpha) << ")" << ::std::endl; }
/*
 * Blocked complex triangular matrix-vector driver: overwrites b in place.
 *
 * m      - order of the matrix
 * a, lda - matrix storage (interleaved re/im) and its leading dimension
 * b,incb - vector and its stride; for incb != 1 the vector is copied into
 *          `buffer`, processed there and copied back at the end
 * buffer - scratch space; the part after the (4096-byte aligned) vector
 *          copy is handed to the GEMV kernels
 *
 * The variant is selected at compile time through TRANSA (1..4) and UNIT.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer) {

  BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT dot;
#endif
#ifndef UNIT
  FLOAT a_re, a_im, b_re, b_im;
#endif

  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  /* Strided vector: work on a contiguous copy. */
  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  /* Walk the matrix in diagonal blocks of at most DTB_ENTRIES columns. */
  for (is =0; is < m; is += DTB_ENTRIES) {

    min_i = MIN(m - is, DTB_ENTRIES);

#if (TRANSA) == 1 || (TRANSA == 3)
    /* Rectangular part above the current diagonal block. */
    if (is > 0) {
#if TRANSA == 1
      GEMV_N(is, min_i, 0, dp1, ZERO,
             a + is * lda * 2, lda,
             B + is * 2, 1,
             B, 1, gemvbuffer);
#else
      GEMV_R(is, min_i, 0, dp1, ZERO,
             a + is * lda * 2, lda,
             B + is * 2, 1,
             B, 1, gemvbuffer);
#endif
    }
#endif

    /* Triangular diagonal block, one column at a time. */
    for (i = 0; i < min_i; i++) {
      FLOAT *acol = a + (is + (i + is) * lda) * 2;
      FLOAT *bblk = B + is * 2;

#if (TRANSA == 1) || (TRANSA == 3)
#if TRANSA == 1
      if (i > 0) AXPYU_K (i, 0, 0, bblk[i * 2 + 0], bblk[i * 2 + 1],
                          acol, 1, bblk, 1, NULL, 0);
#else
      if (i > 0) AXPYC_K(i, 0, 0, bblk[i * 2 + 0], bblk[i * 2 + 1],
                         acol, 1, bblk, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
      /* Scale by the diagonal entry (conjugated for TRANSA 3 and 4). */
      a_re = acol[i * 2 + 0];
      a_im = acol[i * 2 + 1];

      b_re = bblk[i * 2 + 0];
      b_im = bblk[i * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
      bblk[i * 2 + 0] = a_re * b_re - a_im * b_im;
      bblk[i * 2 + 1] = a_re * b_im + a_im * b_re;
#else
      bblk[i * 2 + 0] = a_re * b_re + a_im * b_im;
      bblk[i * 2 + 1] = a_re * b_im - a_im * b_re;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
      if (i < min_i - 1) {
#if TRANSA == 2
        dot = DOTU_K(min_i - i - 1,
                     acol + (i + 1) * 2, 1,
                     bblk + (i + 1) * 2, 1);
#else
        dot = DOTC_K(min_i - i - 1,
                     acol + (i + 1) * 2, 1,
                     bblk + (i + 1) * 2, 1);
#endif

        bblk[i * 2 + 0] += CREAL(dot);
        bblk[i * 2 + 1] += CIMAG(dot);
      }
#endif

    }

#if (TRANSA) == 2 || (TRANSA == 4)
    /* Rectangular part below the current diagonal block. */
    if (m - is > min_i) {
#if TRANSA == 2
      GEMV_T(m - is - min_i, min_i, 0, dp1, ZERO,
             a + (is + min_i + is * lda) * 2, lda,
             B + (is + min_i) * 2, 1,
             B + is * 2, 1, gemvbuffer);
#else
      GEMV_C(m - is - min_i, min_i, 0, dp1, ZERO,
             a + (is + min_i + is * lda) * 2, lda,
             B + (is + min_i) * 2, 1,
             B + is * 2, 1, gemvbuffer);
#endif
    }
#endif
  }

  /* Scatter the result back into the strided vector. */
  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
void fastsum_benchomp_createdataset(unsigned int d, int L, int M) { int t, j, k; R *x; R *y; C *alpha; x = (R*) NFFT(malloc)((size_t)(d * L) * sizeof(R)); y = (R*) NFFT(malloc)((size_t)(d * L) * sizeof(R)); alpha = (C*) NFFT(malloc)((size_t)(L) * sizeof(C)); /** init source knots in a d-ball with radius 1 */ k = 0; while (k < L) { R r_max = K(1.0); R r2 = K(0.0); for (j = 0; j < d; j++) x[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max; for (j = 0; j < d; j++) r2 += x[k * d + j] * x[k * d + j]; if (r2 >= r_max * r_max) continue; k++; } NFFT(vrand_unit_complex)(alpha, L); /** init target knots in a d-ball with radius 1 */ k = 0; while (k < M) { R r_max = K(1.0); R r2 = K(0.0); for (j = 0; j < d; j++) y[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max; for (j = 0; j < d; j++) r2 += y[k * d + j] * y[k * d + j]; if (r2 >= r_max * r_max) continue; k++; } printf("%d %d %d\n", d, L, M); for (j = 0; j < L; j++) { for (t = 0; t < d; t++) printf("%.16" __FES__ " ", x[d * j + t]); printf("\n"); } for (j = 0; j < L; j++) printf("%.16" __FES__ " %.16" __FES__ "\n", CREAL(alpha[j]), CIMAG(alpha[j])); for (j = 0; j < M; j++) { for (t = 0; t < d; t++) printf("%.16" __FES__ " ", y[d * j + t]); printf("\n"); } NFFT(free)(x); NFFT(free)(y); NFFT(free)(alpha); }
/*
 * Complex packed triangular matrix-vector driver: overwrites b in place.
 *
 * m      - order of the matrix
 * a      - packed matrix storage (interleaved re/im); traversed backwards,
 *          starting from its last element
 * b,incb - vector and its stride; for incb != 1 the vector is copied into
 *          `buffer`, processed there and copied back at the end
 * buffer - scratch space
 *
 * The variant is selected at compile time through TRANSA (1..4) and UNIT.
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex dot;
#endif
#ifndef UNIT
  FLOAT a_re, a_im, b_re, b_im;
#endif

  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  /* Strided vector: work on a contiguous copy. */
  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  /* Point at the last packed element. */
  a += (m + 1) * m - 2;

  for (i = 0; i < m; i++) {
    /* Vector element handled in this step (index m - i - 1). */
    FLOAT *cur = B + (m - i - 1) * 2;

#if (TRANSA == 1) || (TRANSA == 3)
#if TRANSA == 1
    if (i > 0) AXPYU_K (i, 0, 0, cur[0], cur[1],
                        a + 2, 1, cur + 2, 1, NULL, 0);
#else
    if (i > 0) AXPYC_K(i, 0, 0, cur[0], cur[1],
                       a + 2, 1, cur + 2, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
    /* Scale by the diagonal entry (conjugated for TRANSA 3 and 4). */
    a_re = a[0];
    a_im = a[1];

    b_re = cur[0];
    b_im = cur[1];

#if (TRANSA == 1) || (TRANSA == 2)
    cur[0] = a_re * b_re - a_im * b_im;
    cur[1] = a_re * b_im + a_im * b_re;
#else
    cur[0] = a_re * b_re + a_im * b_im;
    cur[1] = a_re * b_im - a_im * b_re;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
    if (i < m - 1) {
#if TRANSA == 2
      dot = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#else
      dot = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#endif

      cur[0] += CREAL(dot);
      cur[1] += CIMAG(dot);
    }
#endif

    /* Step back to the previous diagonal entry in packed storage. */
#if (TRANSA == 1) || (TRANSA == 3)
    a -= (i + 2) * 2;
#else
    a -= (m - i) * 2;
#endif

  }

  /* Scatter the result back into the strided vector. */
  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
/*
 * Complex packed Hermitian matrix-vector driver: y += alpha * A * x.
 *
 * m               - order of the matrix
 * alpha_r/alpha_i - real and imaginary part of alpha
 * a               - packed matrix storage (interleaved re/im); advanced by
 *                   one packed column per iteration
 * x, incx         - input vector and stride
 * y, incy         - output vector and stride
 * buffer          - scratch space used for contiguous copies of y and x
 *                   when their stride is not 1
 *
 * Only the real part of each diagonal entry is used.  LOWER selects the
 * stored triangle and HEMVREV swaps the conjugating and non-conjugating
 * kernels.  Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i;
  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = gemvbuffer;
  FLOAT *bufferX    = gemvbuffer;
  FLOAT diag[2];

  /* Strided y: accumulate into a contiguous copy. */
  if (incy != 1) {
    Y = bufferY;
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    gemvbuffer = bufferX;
    COPY_K(m, y, incy, Y, 1);
  }

  /* Strided x: read from a contiguous copy. */
  if (incx != 1) {
    X = bufferX;
    gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, x, incx, X, 1);
  }

  for (i = 0; i < m; i++) {

#ifndef HEMVREV
#ifndef LOWER
    if (i > 0) {
      FLOAT _Complex dot = DOTC_K(i, a, 1, X, 1);

      Y[i * 2 + 0] += alpha_r * CREAL(dot) - alpha_i * CIMAG(dot);
      Y[i * 2 + 1] += alpha_r * CIMAG(dot) + alpha_i * CREAL(dot);
    }

    /* Diagonal contribution (real part of the diagonal only). */
    diag[0] = a[i * 2 + 0] * X[i * 2 + 0];
    diag[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] += alpha_r * diag[0] - alpha_i * diag[1];
    Y[i * 2 + 1] += alpha_r * diag[1] + alpha_i * diag[0];

    if (i > 0) {
      AXPYU_K(i, 0, 0,
              alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
              alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
              a, 1, Y, 1, NULL, 0);
    }

    a += (i + 1) * 2;

#else

    if (m - i > 1) {
      FLOAT _Complex dot = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);

      Y[i * 2 + 0] += alpha_r * CREAL(dot) - alpha_i * CIMAG(dot);
      Y[i * 2 + 1] += alpha_r * CIMAG(dot) + alpha_i * CREAL(dot);
    }

    /* Diagonal contribution (real part of the diagonal only). */
    diag[0] = a[i * 2 + 0] * X[i * 2 + 0];
    diag[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] += alpha_r * diag[0] - alpha_i * diag[1];
    Y[i * 2 + 1] += alpha_r * diag[1] + alpha_i * diag[0];

    if (m - i > 1) {
      AXPYU_K(m - i - 1, 0, 0,
              alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
              alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
              a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
    }

    a += (m - i - 1) * 2;

#endif
#else
#ifndef LOWER
    if (i > 0) {
      FLOAT _Complex dot = DOTU_K(i, a, 1, X, 1);

      Y[i * 2 + 0] += alpha_r * CREAL(dot) - alpha_i * CIMAG(dot);
      Y[i * 2 + 1] += alpha_r * CIMAG(dot) + alpha_i * CREAL(dot);
    }

    /* Diagonal contribution (real part of the diagonal only). */
    diag[0] = a[i * 2 + 0] * X[i * 2 + 0];
    diag[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] += alpha_r * diag[0] - alpha_i * diag[1];
    Y[i * 2 + 1] += alpha_r * diag[1] + alpha_i * diag[0];

    if (i > 0) {
      AXPYC_K(i, 0, 0,
              alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
              alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
              a, 1, Y, 1, NULL, 0);
    }

    a += (i + 1) * 2;

#else

    if (m - i > 1) {
      FLOAT _Complex dot = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);

      Y[i * 2 + 0] += alpha_r * CREAL(dot) - alpha_i * CIMAG(dot);
      Y[i * 2 + 1] += alpha_r * CIMAG(dot) + alpha_i * CREAL(dot);
    }

    /* Diagonal contribution (real part of the diagonal only). */
    diag[0] = a[i * 2 + 0] * X[i * 2 + 0];
    diag[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] += alpha_r * diag[0] - alpha_i * diag[1];
    Y[i * 2 + 1] += alpha_r * diag[1] + alpha_i * diag[0];

    if (m - i > 1) {
      AXPYC_K(m - i - 1, 0, 0,
              alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
              alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0],
              a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
    }

    a += (m - i - 1) * 2;

#endif
#endif

  }

  /* Scatter the accumulated result back into the strided y. */
  if (incy != 1) {
    COPY_K(m, Y, 1, y, incy);
  }

  return 0;
}
/*
 * Per-thread worker for the symmetric/Hermitian band matrix-vector product.
 *
 * args    - call description: a = band matrix, b = x, lda, ldb = incx, n, k
 * range_m - optional [from, to) column range handled by this call; when
 *           NULL all n columns are processed
 * buffer  - scratch space; its first (rounded-up) n * COMPSIZE elements
 *           serve as this call's private result vector y, which is zeroed
 *           and then accumulated into; a contiguous copy of x follows when
 *           incx != 1
 * range_n, dummy1 and pos are not used.
 *
 * Always returns 0.
 */
static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n, k, n_from, n_to;
  BLASLONG i, len;
#ifndef COMPLEX
  FLOAT dot;
#else
  FLOAT _Complex dot;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;

  lda  = args -> lda;
  incx = args -> ldb;

  n = args -> n;
  k = args -> k;

  n_from = 0;
  n_to   = n;

  /* Use y as each thread's n * COMPSIZE elements in the sb buffer. */
  y = buffer;
  buffer += ((COMPSIZE * n + 1023) & ~1023);

  if (range_m) {
    n_from = range_m[0];
    n_to   = range_m[1];

    a += n_from * lda * COMPSIZE;
  }

  /* Strided x: read from a contiguous copy placed after y. */
  if (incx != 1) {
    COPY_K(n, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * n + 1023) & ~1023);
  }

  /* Clear the private result vector. */
  SCAL_K(n, 0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);

  for (i = n_from; i < n_to; i++) {

#ifndef LOWER

    /* Number of stored entries above the diagonal in column i. */
    len = i;
    if (len > k) len = k;

    MYAXPY(len, 0, 0,
           x[i * COMPSIZE + 0],
#ifdef COMPLEX
           x[i * COMPSIZE + 1],
#endif
           a + (k - len) * COMPSIZE, 1, y + (i - len) * COMPSIZE, 1, NULL, 0);

#if !defined(HEMV) && !defined(HEMVREV)
    dot = MYDOT(len + 1, a + (k - len) * COMPSIZE, 1, x + (i - len) * COMPSIZE, 1);
#else
    dot = MYDOT(len    , a + (k - len) * COMPSIZE, 1, x + (i - len) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    y[i * COMPSIZE + 0] += dot;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y[i * COMPSIZE + 0] += CREAL(dot);
    y[i * COMPSIZE + 1] += CIMAG(dot);
#else
    /* Hermitian case: the diagonal is added separately, real part only. */
    y[i * COMPSIZE + 0] += CREAL(dot) + a[k * COMPSIZE] * x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += CIMAG(dot) + a[k * COMPSIZE] * x[i * COMPSIZE + 1];
#endif
#endif

#else

    /* Number of stored entries below the diagonal in column i. */
    len = k;
    if (n - i - 1 < k) len = n - i - 1;

    MYAXPY(len, 0, 0,
           x[i * COMPSIZE + 0],
#ifdef COMPLEX
           x[i * COMPSIZE + 1],
#endif
           a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

#if !defined(HEMV) && !defined(HEMVREV)
    dot = MYDOT(len + 1, a, 1, x + i * COMPSIZE, 1);
#else
    dot = MYDOT(len    , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ;
#endif

#ifndef COMPLEX
    y[i * COMPSIZE + 0] += dot;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y[i * COMPSIZE + 0] += CREAL(dot);
    y[i * COMPSIZE + 1] += CIMAG(dot);
#else
    /* Hermitian case: the diagonal is added separately, real part only. */
    y[i * COMPSIZE + 0] += CREAL(dot) + a[0] * x[i * COMPSIZE + 0];
    y[i * COMPSIZE + 1] += CIMAG(dot) + a[0] * x[i * COMPSIZE + 1];
#endif
#endif

#endif

    a += lda * COMPSIZE;
  }

  return 0;
}
/*
 * Per-thread worker for the general band matrix-vector product.
 *
 * args    - call description: a = band matrix, b = x, c = y, lda,
 *           ldb = incx, ldc = ku, ldd = kl, m, n
 * range_m - optional; when given, y is advanced by range_m[0] elements
 * range_n - optional [from, to) column range handled by this call
 * buffer  - scratch space for a contiguous copy of x (TRANSA builds with
 *           incx != 1 only)
 * dummy1 and pos are not used.
 *
 * The destination is zeroed first.  Without TRANSA each column is added
 * into y with an AXPY; with TRANSA each output element comes from a dot
 * product.  XCONJ flips the sign of the imaginary part used.
 * Always returns 0.
 */
static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n_from, n_to;
  BLASLONG i, offset_l, offset_u, first, last, ku, kl;

#ifdef TRANSA
#ifndef COMPLEX
  FLOAT dot;
#else
  OPENBLAS_COMPLEX_FLOAT dot;
#endif
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  lda  = args -> lda;
  incx = args -> ldb;
  ku   = args -> ldc;
  kl   = args -> ldd;

  n_from = 0;
  n_to   = args -> n;

  if (range_m) y += range_m[0] * COMPSIZE;

  if (range_n) {
    n_from = range_n[0];
    n_to   = range_n[1];

    a += n_from * lda * COMPSIZE;
  }

  /* Columns past m + ku hold no band entries. */
  n_to = MIN(n_to, args -> m + ku);

#ifdef TRANSA
  /* Strided x: read from a contiguous copy. */
  if (incx != 1) {
    COPY_K(args -> m, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
  }
#endif

  /* Clear the destination. */
  SCAL_K(
#ifndef TRANSA
         args -> m,
#else
         args -> n,
#endif
         0, 0, ZERO,
#ifdef COMPLEX
         ZERO,
#endif
         y, 1, NULL, 0, NULL, 0);

  offset_u = ku - n_from;
  offset_l = ku - n_from + args -> m;

#ifndef TRANSA
  x += n_from * incx * COMPSIZE;
  y -= offset_u * COMPSIZE;
#else
  x -= offset_u * COMPSIZE;
  y += n_from * COMPSIZE;
#endif

  for (i = n_from; i < n_to; i++) {

    /* [first, last) is the range of band rows present in column i. */
    first = MAX(offset_u, 0);
    last  = MIN(offset_l, ku + kl + 1);

#ifndef TRANSA
    MYAXPY(last - first, 0, 0,
           x[0],
#ifdef COMPLEX
#ifndef XCONJ
           x[1],
#else
           -x[1],
#endif
#endif
           a + first * COMPSIZE, 1, y + first * COMPSIZE, 1, NULL, 0);

    x += incx * COMPSIZE;
#else
    dot = MYDOT(last - first, a + first * COMPSIZE, 1, x + first * COMPSIZE, 1);

#ifndef COMPLEX
    *y = dot;
#else
    y[0] += CREAL(dot);
#ifndef XCONJ
    y[1] += CIMAG(dot);
#else
    y[1] -= CIMAG(dot);
#endif
#endif

    x += COMPSIZE;
#endif

    y += COMPSIZE;

    offset_u --;
    offset_l --;

    a += lda * COMPSIZE;
  }

  return 0;
}
clblasStatus doHer2k( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; clblasUplo fUplo; clblasTranspose fTransA; cl_event firstHerkCall; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if (numCommandQueues == 0 || commandQueues == NULL) { return clblasInvalidValue; } numCommandQueues = 1; if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } // Validate arguments if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) { return retCode; } if (transA == clblasTrans) { return clblasInvalidValue; } if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET )) { return retCode; } if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET )) { return retCode; } if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET )) { return retCode; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } fUplo = (order == clblasRowMajor) ? ((uplo == clblasLower) ? clblasUpper : clblasLower) : uplo; fTransA = (order == clblasRowMajor) ? ((transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans) : transA; kargs->order = (order == clblasRowMajor) ? clblasColumnMajor : order; kargs->transA = fTransA; kargs->transB = (fTransA == clblasNoTrans) ? 
clblasConjTrans : clblasNoTrans; kargs->uplo = fUplo; kargs->M = N; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offb; kargs->ldb.matrix = ldb; kargs->C = C; kargs->offCY = offc; kargs->ldc.matrix = ldc; kargs->pigFuncID = CLBLAS_HERK; err = executeGEMM(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstHerkCall); if( err == CL_SUCCESS ) { kargs->A = B; kargs->offA = offb; kargs->offa = offb; kargs->lda.matrix = ldb; kargs->B = A; kargs->offBX = offa; kargs->ldb.matrix = lda; if( kargs->dtype == TYPE_COMPLEX_FLOAT ) { CIMAG( kargs->alpha.argFloatComplex ) *= -1.0; CREAL( kargs->beta.argFloatComplex ) = 1.0; CIMAG( kargs->beta.argFloatComplex ) = 0.0; } else { CIMAG( kargs->alpha.argDoubleComplex ) *= -1.0; CREAL( kargs->beta.argDoubleComplex ) = 1.0; CIMAG( kargs->beta.argDoubleComplex ) = 0.0; } err = executeGEMM(kargs, numCommandQueues, commandQueues, 1, &firstHerkCall, events); } return (clblasStatus)err; }
int main(int argc, char **argv) { int j, k, t; /**< indices */ int d; /**< number of dimensions */ int N; /**< number of source nodes */ int M; /**< number of target nodes */ int n; /**< expansion degree */ int m; /**< cut-off parameter */ int p; /**< degree of smoothness */ const char *s; /**< name of kernel */ C (*kernel)(R, int, const R *); /**< kernel function */ R c; /**< parameter for kernel */ fastsum_plan my_fastsum_plan; /**< plan for fast summation */ C *direct; /**< array for direct computation */ ticks t0, t1; /**< for time measurement */ R time; /**< for time measurement */ R error = K(0.0); /**< for error computation */ R eps_I; /**< inner boundary */ R eps_B; /**< outer boundary */ FILE *fid1, *fid2; R temp; if (argc != 11) { printf("\nfastsum_test d N M n m p kernel c\n\n"); printf(" d dimension \n"); printf(" N number of source nodes \n"); printf(" M number of target nodes \n"); printf(" n expansion degree \n"); printf(" m cut-off parameter \n"); printf(" p degree of smoothness \n"); printf(" kernel kernel function (e.g., gaussian)\n"); printf(" c kernel parameter \n"); printf(" eps_I inner boundary \n"); printf(" eps_B outer boundary \n\n"); exit(-1); } else { d = atoi(argv[1]); N = atoi(argv[2]); c = K(1.0) / POW((R)(N), K(1.0) / ((R)(d))); M = atoi(argv[3]); n = atoi(argv[4]); m = atoi(argv[5]); p = atoi(argv[6]); s = argv[7]; c = (R)(atof(argv[8])); eps_I = (R)(atof(argv[9])); eps_B = (R)(atof(argv[10])); if (strcmp(s, "gaussian") == 0) kernel = gaussian; else if (strcmp(s, "multiquadric") == 0) kernel = multiquadric; else if (strcmp(s, "inverse_multiquadric") == 0) kernel = inverse_multiquadric; else if (strcmp(s, "logarithm") == 0) kernel = logarithm; else if (strcmp(s, "thinplate_spline") == 0) kernel = thinplate_spline; else if (strcmp(s, "one_over_square") == 0) kernel = one_over_square; else if (strcmp(s, "one_over_modulus") == 0) kernel = one_over_modulus; else if (strcmp(s, "one_over_x") == 0) kernel = one_over_x; else if (strcmp(s, 
"inverse_multiquadric3") == 0) kernel = inverse_multiquadric3; else if (strcmp(s, "sinc_kernel") == 0) kernel = sinc_kernel; else if (strcmp(s, "cosc") == 0) kernel = cosc; else if (strcmp(s, "cot") == 0) kernel = kcot; else { s = "multiquadric"; kernel = multiquadric; } } printf( "d=%d, N=%d, M=%d, n=%d, m=%d, p=%d, kernel=%s, c=%" __FGS__ ", eps_I=%" __FGS__ ", eps_B=%" __FGS__ " \n", d, N, M, n, m, p, s, c, eps_I, eps_B); /** init two dimensional fastsum plan */ fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, 0, n, m, p, eps_I, eps_B); /*fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, EXACT_NEARFIELD, n, m, p);*/ /** load source knots and coefficients */ fid1 = fopen("x.dat", "r"); fid2 = fopen("alpha.dat", "r"); for (k = 0; k < N; k++) { for (t = 0; t < d; t++) { fscanf(fid1, __FR__, &my_fastsum_plan.x[k * d + t]); } fscanf(fid2, __FR__, &temp); my_fastsum_plan.alpha[k] = temp; fscanf(fid2, __FR__, &temp); my_fastsum_plan.alpha[k] += temp * II; } fclose(fid1); fclose(fid2); /** load target knots */ fid1 = fopen("y.dat", "r"); for (j = 0; j < M; j++) { for (t = 0; t < d; t++) { fscanf(fid1, __FR__, &my_fastsum_plan.y[j * d + t]); } } fclose(fid1); /** direct computation */ printf("direct computation: "); fflush(NULL); t0 = getticks(); fastsum_exact(&my_fastsum_plan); t1 = getticks(); time = NFFT(elapsed_seconds)(t1, t0); printf(__FI__ "sec\n", time); /** copy result */ direct = (C *) NFFT(malloc)((size_t)(my_fastsum_plan.M_total) * (sizeof(C))); for (j = 0; j < my_fastsum_plan.M_total; j++) direct[j] = my_fastsum_plan.f[j]; /** precomputation */ printf("pre-computation: "); fflush(NULL); t0 = getticks(); fastsum_precompute(&my_fastsum_plan); t1 = getticks(); time = NFFT(elapsed_seconds)(t1, t0); printf(__FI__ "sec\n", time); /** fast computation */ printf("fast computation: "); fflush(NULL); t0 = getticks(); fastsum_trafo(&my_fastsum_plan); t1 = getticks(); time = NFFT(elapsed_seconds)(t1, t0); printf(__FI__ "sec\n", time); /** compute max 
error */ error = K(0.0); for (j = 0; j < my_fastsum_plan.M_total; j++) { if (CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]) > error) error = CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]); } printf("max relative error: " __FE__ "\n", error); /** write result to file */ fid1 = fopen("f.dat", "w+"); fid2 = fopen("f_direct.dat", "w+"); if (fid1 == NULL) { printf("Fehler!\n"); exit(EXIT_FAILURE); } for (j = 0; j < M; j++) { temp = CREAL(my_fastsum_plan.f[j]); fprintf(fid1, " % .16" __FES__ "", temp); temp = CIMAG(my_fastsum_plan.f[j]); fprintf(fid1, " % .16" __FES__ "\n", temp); temp = CREAL(direct[j]); fprintf(fid2, " % .16" __FES__ "", temp); temp = CIMAG(direct[j]); fprintf(fid2, " % .16" __FES__ "\n", temp); } fclose(fid1); fclose(fid2); /** finalise the plan */ fastsum_finalize(&my_fastsum_plan); return EXIT_SUCCESS; }
// Correctness check for HER2K: runs the host reference implementation and the
// clblas implementation on the same random input and compares the resulting
// C matrices. The test is skipped (reported as success) when the transpose
// mode is clblasTrans, when the device lacks double precision for a double
// type, or when host/device buffers cannot be created.
// NOTE(review): T is used as a type here but no template header is visible
// in this part of the file - presumably it is declared just above.
void
her2kCorrectnessTest(TestParams *params)
{
    if (params->transA == clblasTrans) {
        ::std::cerr << ">> her2k(TRANSPOSE) for complex numbers is not allowed."
                    << ::std::endl
                    << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    clMath::BlasBase *base = clMath::BlasBase::getInstance();

    const bool isDoubleType = (typeid(T) == typeid(cl_double)) ||
                              (typeid(T) == typeid(DoubleComplex));
    if (isDoubleType && !base->isDevSupportDoublePrecision()) {
        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic"
                  << std::endl
                  << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    cl_event *events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    // Host matrices: inputs A and B, plus one copy of C per implementation.
    T *A = new T[params->rowsA * params->columnsA];
    T *B = new T[params->rowsB * params->columnsB];
    T *blasC = new T[params->rowsC * params->columnsC];
    T *clblasC = new T[params->rowsC * params->columnsC];

    if (!((A != NULL) && (B != NULL) && (blasC != NULL) && (clblasC != NULL))) {
        deleteBuffers<T>(A, B, blasC, clblasC);
        ::std::cerr << "Cannot allocate memory on host side\n"
                    << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
    }

    srand(params->seed);

    T alpha = convertMultiplier<T>(params->alpha);
    T beta = convertMultiplier<T>(params->beta);

    ::std::cerr << "Generating input data... ";

    // B is generated with the transpose mode opposite to A's.
    clblasTranspose transB;
    if (params->transA == clblasNoTrans) {
        transB = clblasConjTrans;
    }
    else {
        transB = clblasNoTrans;
    }

    randomGemmMatrices<T>(params->order, params->transA, transB,
                          params->N, params->N, params->K,
                          true, &alpha, A, params->lda, B, params->ldb,
                          true, &beta, blasC, params->ldc);

    // Both implementations must start from the same C.
    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
    ::std::cerr << "Done" << ::std::endl;

    cl_mem bufA = base->createEnqueueBuffer(
        A, params->rowsA * params->columnsA * sizeof(*A),
        params->offA * sizeof(*A), CL_MEM_READ_ONLY);
    cl_mem bufB = base->createEnqueueBuffer(
        B, params->rowsB * params->columnsB * sizeof(*B),
        params->offBX * sizeof(*B), CL_MEM_READ_ONLY);
    cl_mem bufC = base->createEnqueueBuffer(
        clblasC, params->rowsC * params->columnsC * sizeof(*clblasC),
        params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE);

    if (!((bufA != NULL) && (bufB != NULL) && (bufC != NULL))) {
        /* Skip the test, the most probable reason is
         * matrix too big for a device.
         */
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
                    << ::std::endl
                    << ">> Can't execute the test, because data is not transfered to GPU."
                    << ::std::endl
                    << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling reference xHER2K routine... ";

    T refAlpha = alpha;
    if (params->order != clblasColumnMajor) {
        // Row-major input is fed to the column-major reference by negating
        // Im(alpha), swapping NoTrans <-> ConjTrans and flipping the triangle
        // (according to the netlib C interface).
        CIMAG( refAlpha ) *= -1.0;

        clblasTranspose refTrans = clblasNoTrans;
        if (params->transA == clblasNoTrans) {
            refTrans = clblasConjTrans;
        }

        clblasUplo refUplo = clblasUpper;
        if (params->uplo == clblasUpper) {
            refUplo = clblasLower;
        }

        ::clMath::blas::her2k(clblasColumnMajor, refUplo, refTrans,
                              params->N, params->K, refAlpha,
                              A, 0, params->lda, B, 0, params->ldb,
                              CREAL(beta), blasC, 0, params->ldc);
    }
    else {
        ::clMath::blas::her2k(clblasColumnMajor, params->uplo, params->transA,
                              params->N, params->K, refAlpha,
                              A, 0, params->lda, B, 0, params->ldb,
                              CREAL(beta), blasC, 0, params->ldc);
    }
    ::std::cerr << "Done" << ::std::endl;

    ::std::cerr << "Calling clblas xHER2K routine... ";

    cl_int err = (cl_int)::clMath::clblas::her2k(
        params->order, params->uplo, params->transA,
        params->N, params->K, alpha,
        bufA, params->offA, params->lda,
        bufB, params->offBX, params->ldb,
        CREAL(beta), bufC, params->offCY, params->ldc,
        params->numCommandQueues, base->commandQueues(),
        0, NULL, events);

    if (err != CL_SUCCESS) {
        // Resources are released before the assertion reports the failure.
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2K() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
                                  base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    // Blocking read of the device result back into the host copy of C.
    clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE,
                        params->offCY * sizeof(*clblasC),
                        params->rowsC * params->columnsC * sizeof(*clblasC),
                        clblasC, 0, NULL, NULL);

    releaseMemObjects(bufA, bufB, bufC);

    compareMatrices<T>(params->order, params->N, params->N,
                       blasC, clblasC, params->ldc);

    deleteBuffers<T>(A, B, blasC, clblasC);
    delete[] events;
}
/*
 * Complex band matrix-vector accumulation built from the unconjugated
 * AXPYU/DOTU kernels: for every column i, the stored band segment is added to
 * Y scaled by alpha * X[i], and alpha times the dot product of the
 * off-diagonal part of that segment with X is added to Y[i].
 * NOTE(review): the original scratch-pointer name suggests this is the
 * symmetric band product (SBMV), i.e. y += alpha * A * x - confirm.
 *
 *   n, k             order of the matrix and band width used for the segments
 *   alpha_r, alpha_i real and imaginary part of the scalar
 *   a, lda           band storage, advanced by lda complex elements per column
 *   x, incx          input vector and its stride
 *   y, incy          vector that is accumulated into, and its stride
 *   buffer           scratch space used for contiguous copies of y and x when
 *                    their stride is not 1
 *
 * LOWER selects which side of the diagonal is stored. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
	  FLOAT *a, BLASLONG lda,
	  FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i, length;
#ifndef LOWER
  BLASLONG offset;
#endif
  FLOAT ax_r, ax_i;
  /* Same complex type as the other kernels in this file. */
  OPENBLAS_COMPLEX_FLOAT result;

  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *bufferY = (FLOAT *)buffer;
  FLOAT *bufferX = (FLOAT *)buffer;

  if (incy != 1) {
    Y = bufferY;
    /* The copy of x, if needed, starts after the copy of y, rounded to 4096 bytes. */
    bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    COPY_K(n, x, incx, X, 1);
  }

#ifndef LOWER
  offset = k;
#endif

  for (i = 0; i < n; i++) {

    /* alpha * X[i], computed once per column. */
    ax_r = alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1];
    ax_i = alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0];

#ifndef LOWER
    length = k - offset;

    AXPYU_K(length + 1, 0, 0, ax_r, ax_i,
	    a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0);
#else
    length = k;
    if (n - i - 1 < k) length = n - i - 1;

    AXPYU_K(length + 1, 0, 0, ax_r, ax_i,
	    a, 1, Y + i * COMPSIZE, 1, NULL, 0);
#endif

    if (length > 0) {
#ifndef LOWER
      result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1);
#else
      result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1);
#endif

      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

#ifndef LOWER
    /* The segment grows by one element per column until it spans k + 1. */
    if (offset > 0) offset --;
#endif

    a += lda * 2;
  }

  if (incy != 1) {
    /* Scatter the contiguous result back into the strided output. */
    COPY_K(n, Y, 1, y, incy);
  }

  return 0;
}
/*
 * Worker routine processing columns [m_from, m_to) of a matrix whose columns
 * are laid out back to back with shrinking/growing length (the a-pointer
 * arithmetic matches packed triangular storage - TODO confirm).
 *
 * For each column i it adds a dot product of the stored column with x to
 * y[i] and then adds the column scaled by x[i] into y via MYAXPY. With HEMV
 * or HEMVREV defined the diagonal entry is left out of the dot product and
 * its first component is applied to x[i] separately.
 *
 *   args     a = matrix, b = x, c = y, m = order, ldb = stride of x
 *   range_m  optional [from, to) column range; the whole matrix when NULL
 *   range_n  optional offset (in elements) applied to y
 *   buffer   receives a contiguous copy of x when its stride is not 1
 *
 * dummy1 and pos are not used. Always returns 0.
 */
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args -> a;
  FLOAT *x = (FLOAT *)args -> b;
  FLOAT *y = (FLOAT *)args -> c;
  BLASLONG incx = args -> ldb;
  BLASLONG m_from = 0;
  BLASLONG m_to = args -> m;
  BLASLONG i;
  BLASLONG clear_len;
  FLOAT *clear_dst;

#ifndef COMPLEX
  FLOAT dot;
#else
  OPENBLAS_COMPLEX_FLOAT dot;
#endif

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  if (range_n) y += range_n[0] * COMPSIZE;

  /* Bring the part of x that this range touches into contiguous storage. */
  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
    x = buffer;
  }

  /* Zero the slice of y that the loop below accumulates into. */
#ifndef LOWER
  clear_len = m_to;
  clear_dst = y;
#else
  clear_len = args -> m - m_from;
  clear_dst = y + m_from * COMPSIZE;
#endif

  SCAL_K(clear_len, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 clear_dst, 1, NULL, 0, NULL, 0);

  /* Move a to the position used for column m_from. */
#ifndef LOWER
  a += (m_from + 1) * m_from / 2 * COMPSIZE;
#else
  a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++) {
    FLOAT *x_cur = x + i * COMPSIZE;
    FLOAT *y_cur = y + i * COMPSIZE;

    /* Dot product of the stored column with the matching part of x. */
#ifndef LOWER
#if !defined(HEMV) && !defined(HEMVREV)
    dot = MYDOT(i + 1, a, 1, x, 1);
#else
    dot = MYDOT(i, a, 1, x, 1);
#endif
#else
#if !defined(HEMV) && !defined(HEMVREV)
    dot = MYDOT(args -> m - i, a + i * COMPSIZE, 1, x_cur, 1);
#else
    dot = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
#endif
#endif

#ifndef COMPLEX
    y_cur[0] += dot;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    y_cur[0] += CREAL(dot);
    y_cur[1] += CIMAG(dot);
#else
    /* Diagonal term: only the first component of a[i] is used. */
    y_cur[0] += CREAL(dot) + a[i * COMPSIZE] * x_cur[0];
    y_cur[1] += CIMAG(dot) + a[i * COMPSIZE] * x_cur[1];
#endif
#endif

    /* Off-diagonal part of the column, scaled by x[i], added into y. */
#ifndef LOWER
    MYAXPY(i, 0, 0,
	   x_cur[0],
#ifdef COMPLEX
	   x_cur[1],
#endif
	   a, 1, y, 1, NULL, 0);

    a += (i + 1) * COMPSIZE;
#else
    MYAXPY(args -> m - i - 1, 0, 0,
	   x_cur[0],
#ifdef COMPLEX
	   x_cur[1],
#endif
	   a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

    a += (args -> m - i - 1) * COMPSIZE;
#endif
  }

  return 0;
}
/*
 * Blocked complex triangular solve that overwrites b with the solution.
 * The matrix is walked from the last diagonal element towards the first in
 * blocks of at most DTB_ENTRIES rows.
 *
 *   TRANSA 1 / 3  after each unknown is computed, its contribution is removed
 *                 from the entries above it (AXPYU / AXPYC inside the block,
 *                 GEMV_N / GEMV_R for the rows before the block)
 *   TRANSA 2 / 4  before each unknown is computed, the contribution of the
 *                 already solved entries is subtracted (GEMV_T / GEMV_C for
 *                 the rows after the block, DOTU / DOTC inside the block)
 *
 * Unless UNIT is defined, each unknown is multiplied by the reciprocal of the
 * diagonal element; for TRANSA >= 3 the reciprocal of its conjugate is used.
 *
 *   m        order of the matrix
 *   a, lda   matrix and its leading dimension (in complex elements)
 *   b, incb  right-hand side / solution vector and its stride
 *   buffer   scratch space; holds a contiguous copy of b when incb != 1,
 *            followed by the GEMV work area
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
  /* Same complex type as the other kernels in this file. */
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#ifndef UNIT
  FLOAT ar, ai, br, bi, ratio, den;
#endif
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = (FLOAT *)buffer;
    /* The GEMV work area starts after the copy of b, rounded to 4096 bytes. */
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, B, 1);
  }

  for (is = m; is > 0; is -= DTB_ENTRIES){

    min_i = MIN(is, DTB_ENTRIES);

#if (TRANSA == 2) || (TRANSA == 4)
    /* Subtract the contribution of the entries solved in earlier blocks. */
    if (m - is > 0){
#if TRANSA == 2
      GEMV_T(m - is, min_i, 0, dm1, ZERO,
	     a + (is + (is - min_i) * lda) * COMPSIZE, lda,
	     B + is * COMPSIZE, 1,
	     B + (is - min_i) * COMPSIZE, 1, gemvbuffer);
#else
      GEMV_C(m - is, min_i, 0, dm1, ZERO,
	     a + (is + (is - min_i) * lda) * COMPSIZE, lda,
	     B + is * COMPSIZE, 1,
	     B + (is - min_i) * COMPSIZE, 1, gemvbuffer);
#endif
    }
#endif

    for (i = 0; i < min_i; i++) {
      /* Diagonal element and right-hand side entry of row is - i - 1. */
      FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * COMPSIZE;
      FLOAT *BB = B + (is - i - 1) * COMPSIZE;

#if (TRANSA == 2) || (TRANSA == 4)
      if (i > 0) {
#if TRANSA == 2
	result = DOTU_K(i, AA + 2, 1, BB + 2, 1);
#else
	result = DOTC_K(i, AA + 2, 1, BB + 2, 1);
#endif

	BB[0] -= CREAL(result);
	BB[1] -= CIMAG(result);
      }
#endif

#ifndef UNIT
      ar = AA[0];
      ai = AA[1];

      /*
       * Reciprocal of the diagonal element; dividing by the component of
       * larger magnitude avoids forming ar*ar + ai*ai directly.
       */
      if (fabs(ar) >= fabs(ai)){
	ratio = ai / ar;
	den = 1./(ar * ( 1 + ratio * ratio));

	ar = den;
#if TRANSA < 3
	ai = -ratio * den;
#else
	ai = ratio * den;
#endif
      } else {
	ratio = ar / ai;
	den = 1./(ai * ( 1 + ratio * ratio));
	ar = ratio * den;
#if TRANSA < 3
	ai = -den;
#else
	ai = den;
#endif
      }

      br = BB[0];
      bi = BB[1];

      BB[0] = ar*br - ai*bi;
      BB[1] = ar*bi + ai*br;
#endif

#if (TRANSA == 1) || (TRANSA == 3)
      /* Remove the new unknown from the remaining rows of this block. */
      if (i < min_i - 1) {
#if TRANSA == 1
	AXPYU_K(min_i - i - 1, 0, 0, - BB[0], -BB[1],
		AA - (min_i - i - 1) * COMPSIZE, 1,
		BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0);
#else
	AXPYC_K(min_i - i - 1, 0, 0, - BB[0], -BB[1],
		AA - (min_i - i - 1) * COMPSIZE, 1,
		BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0);
#endif
      }
#endif
    }

#if (TRANSA == 1) || (TRANSA == 3)
    /* Remove this block's unknowns from all rows before the block. */
    if (is - min_i > 0){
#if TRANSA == 1
      GEMV_N(is - min_i, min_i, 0, dm1, ZERO,
	     a + (is - min_i) * lda * COMPSIZE, lda,
	     B + (is - min_i) * COMPSIZE, 1,
	     B, 1, gemvbuffer);
#else
      GEMV_R(is - min_i, min_i, 0, dm1, ZERO,
	     a + (is - min_i) * lda * COMPSIZE, lda,
	     B + (is - min_i) * COMPSIZE, 1,
	     B, 1, gemvbuffer);
#endif
    }
#endif
  }

  if (incb != 1) {
    /* Scatter the contiguous solution back into the strided vector. */
    COPY_K(m, B, 1, b, incb);
  }

  return 0;
}
/*
 * Worker routine for a triangular matrix-vector product over columns
 * [m_from, m_to). Columns are stored back to back with changing length
 * (the a-pointer arithmetic matches packed triangular storage - TODO
 * confirm). The result is accumulated into y, which is zeroed first.
 *
 *   !TRANS  column i scaled by x[i] is added into y (MYAXPY)
 *   TRANS   the dot product of column i with x is added to y[i] (MYDOT)
 *
 * The diagonal contributes x[i] itself when UNIT is defined, otherwise
 * a_ii * x[i]; for complex data TRANSA 3 and 4 use the conjugate of a_ii.
 *
 *   args     a = matrix, b = x, c = y, m = order, ldb = stride of x
 *   range_m  optional [from, to) column range; the whole matrix when NULL
 *   range_n  optional offset (in elements) applied to y, !TRANS only
 *   buffer   receives a contiguous copy of x when its stride is not 1
 *
 * dummy1 and pos are not used. Always returns 0.
 */
static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args -> a;
  FLOAT *x = (FLOAT *)args -> b;
  FLOAT *y = (FLOAT *)args -> c;
  BLASLONG incx = args -> ldb;
  BLASLONG m_from = 0;
  BLASLONG m_to = args -> m;
  BLASLONG i;
  BLASLONG clear_len;
  FLOAT *clear_dst;

#ifdef TRANS
#ifndef COMPLEX
  FLOAT dot;
#else
  OPENBLAS_COMPLEX_FLOAT dot;
#endif
#endif

#if defined(COMPLEX) && !defined(UNIT)
  FLOAT diag_r, diag_i, x_r, x_i;
#endif

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  /* Bring the part of x that this range touches into contiguous storage. */
  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
    x = buffer;
    buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
  }

#ifndef TRANS
  if (range_n) y += range_n[0] * COMPSIZE;
#endif

  /* Select the slice of y that the loop below accumulates into ... */
#ifdef TRANS
  clear_len = m_to - m_from;
  clear_dst = y + m_from * COMPSIZE;
#else
#ifndef LOWER
  clear_len = m_to;
  clear_dst = y;
#else
  clear_len = args -> m - m_from;
  clear_dst = y + m_from * COMPSIZE;
#endif
#endif

  /* ... and zero it. */
  SCAL_K(clear_len, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 clear_dst, 1, NULL, 0, NULL, 0);

  /* Move a to the position used for column m_from. */
#ifndef LOWER
  a += (m_from + 1) * m_from / 2 * COMPSIZE;
#else
  a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++) {
    FLOAT *x_cur = x + i * COMPSIZE;
    FLOAT *y_cur = y + i * COMPSIZE;

#ifndef LOWER
    /* Entries of column i that lie before the diagonal. */
    if (i > 0) {
#ifndef TRANS
      MYAXPY(i, 0, 0,
	     x_cur[0],
#ifdef COMPLEX
	     x_cur[1],
#endif
	     a, 1, y, 1, NULL, 0);
#else
      dot = MYDOT(i, a, 1, x, 1);

#ifndef COMPLEX
      y_cur[0] += dot;
#else
      y_cur[0] += CREAL(dot);
      y_cur[1] += CIMAG(dot);
#endif
#endif
    }
#endif

    /* Diagonal contribution. */
#ifndef COMPLEX
#ifdef UNIT
    y_cur[0] += x_cur[0];
#else
    y_cur[0] += a[i * COMPSIZE] * x_cur[0];
#endif
#else
#ifdef UNIT
    y_cur[0] += x_cur[0];
    y_cur[1] += x_cur[1];
#else
    diag_r = a[i * COMPSIZE + 0];
    diag_i = a[i * COMPSIZE + 1];
    x_r = x_cur[0];
    x_i = x_cur[1];

#if (TRANSA == 1) || (TRANSA == 2)
    y_cur[0] += diag_r * x_r - diag_i * x_i;
    y_cur[1] += diag_r * x_i + diag_i * x_r;
#else
    y_cur[0] += diag_r * x_r + diag_i * x_i;
    y_cur[1] += diag_r * x_i - diag_i * x_r;
#endif
#endif
#endif

#ifdef LOWER
    /* Entries of column i that lie after the diagonal. */
    if (args -> m > i + 1) {
#ifndef TRANS
      MYAXPY(args -> m - i - 1, 0, 0,
	     x_cur[0],
#ifdef COMPLEX
	     x_cur[1],
#endif
	     a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else
      dot = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
      y_cur[0] += dot;
#else
      y_cur[0] += CREAL(dot);
      y_cur[1] += CIMAG(dot);
#endif
#endif
    }
#endif

    /* Step to the next column. */
#ifndef LOWER
    a += (i + 1) * COMPSIZE;
#else
    a += (args -> m - i - 1) * COMPSIZE;
#endif
  }

  return 0;
}
/*
 * Complex general band matrix-vector accumulation into y, one stored column
 * at a time. For column i only the rows start..end-1 of the band storage are
 * touched, where the window is derived from ku, kl and m.
 *
 *   !TRANS  the column segment scaled by alpha * x[i] is added into Y
 *           (alpha * conj(x[i]) when XCONJ is defined)
 *   TRANS   alpha times the dot product of the segment with X is added to
 *           Y[i]; when both XCONJ and CONJ are defined the conjugate of the
 *           dot product is used
 *
 *   m, n, ku, kl     dimensions and numbers of super-/sub-diagonals
 *   alpha_r, alpha_i real and imaginary part of the scalar
 *   a, lda           band storage, advanced by lda complex elements per column
 *   x, incx          input vector and its stride
 *   y, incy          vector that is accumulated into, and its stride
 *   buffer           scratch space for contiguous copies of y and x
 *
 * NOTE(review): M, N, ZAXPY and ZDOT are not defined in this function;
 * presumably file-level macros that depend on TRANS/CONJ - confirm.
 */
void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOAT alpha_i,
	  FLOAT *a, BLASLONG lda,
	  FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i, offset_u, offset_l, start, end, length;
  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = gemvbuffer;
  FLOAT *bufferX    = gemvbuffer;
#ifndef TRANS
  FLOAT ax_r, ax_i;
#else
  /* Same complex type as the other kernels in this file. */
  OPENBLAS_COMPLEX_FLOAT temp;
#endif

  if (incy != 1) {
    Y = bufferY;
    /* The copy of x, if needed, starts after the copy of y, rounded to 4096 bytes. */
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) * 2 + 4095) & ~4095);
    gemvbuffer = bufferX;
    COPY_K(M, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(N, x, incx, X, 1);
  }

  offset_u = ku;
  offset_l = ku + m;

  for (i = 0; i < MIN(n, m + ku); i++) {

    /* Window of stored rows for column i. */
    start = MAX(offset_u, 0);
    end   = MIN(offset_l, ku + kl + 1);

    length  = end - start;

#ifndef TRANS
    /*
     * The scale factor is computed up front so that no preprocessor
     * directive has to appear inside the ZAXPY argument list.
     */
#ifndef XCONJ
    ax_r = alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1];
    ax_i = alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1];
#else
    ax_r = alpha_r * X[i * 2 + 0] + alpha_i * X[i * 2 + 1];
    ax_i = alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1];
#endif

    ZAXPY(length, 0, 0, ax_r, ax_i,
	  a + start * 2, 1, Y + (start - offset_u) * 2, 1, NULL, 0);
#else

#ifndef XCONJ
    temp = ZDOT(length, a + start * 2, 1, X + (start - offset_u) * 2, 1);
#else
    temp = ZDOT(length, X + (start - offset_u) * 2, 1, a + start * 2, 1);
#endif

#if !defined(XCONJ) || !defined(CONJ)
    Y[i * 2 + 0] += alpha_r * CREAL(temp) - alpha_i * CIMAG(temp);
    Y[i * 2 + 1] += alpha_i * CREAL(temp) + alpha_r * CIMAG(temp);
#else
    Y[i * 2 + 0] += alpha_r * CREAL(temp) + alpha_i * CIMAG(temp);
    Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp);
#endif
#endif

    offset_u --;
    offset_l --;

    a += lda * 2;
  }

  if (incy != 1) {
    /* Scatter the contiguous result back into the strided output. */
    COPY_K(M, Y, 1, y, incy);
  }

  return;
}
/*
 * In-place complex triangular band matrix-vector product: b is overwritten
 * with the result, processed column by column from first to last.
 *
 *   TRANSA 1 / 3  up to k entries before position col receive the stored
 *                 column scaled by b[col] (AXPYU / AXPYC); the diagonal
 *                 element is read from a[k]
 *   TRANSA 2 / 4  the dot product of up to k entries after position col with
 *                 the stored column is added to b[col] (DOTU / DOTC); the
 *                 diagonal element is read from a[0]
 *
 * Unless UNIT is defined, b[col] is multiplied by the diagonal element
 * (TRANSA 1, 2) or by its conjugate (TRANSA 3, 4).
 *
 *   n, k     order of the matrix and number of off-diagonals
 *   a, lda   band storage, advanced by lda complex elements per column
 *   b, incb  vector and its stride
 *   buffer   scratch space; holds a contiguous copy of b when incb != 1
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG col;
  BLASLONG span;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;
  FLOAT *cur;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT dot;
#endif
#ifndef UNIT
  FLOAT diag_r, diag_i, b_r, b_i;
#endif

  if (incb != 1) {
    B = (FLOAT *)buffer;
    /* Area after the copy of b, rounded to 4096 bytes; not referenced again below. */
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, b, incb, B, 1);
  }

  for (col = 0; col < n; col++) {

    cur = B + col * 2;

#if (TRANSA == 1) || (TRANSA == 3)
    /* Number of band entries before the diagonal in this column. */
    span = (col > k) ? k : col;

    if (span > 0) {
#if TRANSA == 1
      AXPYU_K(span, 0, 0,
	      cur[0], cur[1],
	      a + (k - span) * COMPSIZE, 1, B + (col - span) * COMPSIZE, 1, NULL, 0);
#else
      AXPYC_K(span, 0, 0,
	      cur[0], cur[1],
	      a + (k - span) * COMPSIZE, 1, B + (col - span) * COMPSIZE, 1, NULL, 0);
#endif
    }
#endif

#ifndef UNIT
#if (TRANSA == 1) || (TRANSA == 3)
    diag_r = a[k * 2 + 0];
    diag_i = a[k * 2 + 1];
#else
    diag_r = a[0];
    diag_i = a[1];
#endif

    b_r = cur[0];
    b_i = cur[1];

#if (TRANSA == 1) || (TRANSA == 2)
    cur[0] = diag_r * b_r - diag_i * b_i;
    cur[1] = diag_r * b_i + diag_i * b_r;
#else
    cur[0] = diag_r * b_r + diag_i * b_i;
    cur[1] = diag_r * b_i - diag_i * b_r;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
    /* Number of band entries after the diagonal in this column. */
    span = n - col - 1;
    if (span > k) span = k;

    if (span > 0) {
#if TRANSA == 2
      dot = DOTU_K(span, a + COMPSIZE, 1, B + (col + 1) * COMPSIZE, 1);
#else
      dot = DOTC_K(span, a + COMPSIZE, 1, B + (col + 1) * COMPSIZE, 1);
#endif

      cur[0] += CREAL(dot);
      cur[1] += CIMAG(dot);
    }
#endif

    a += lda * COMPSIZE;
  }

  if (incb != 1) {
    /* Scatter the contiguous result back into the strided vector. */
    COPY_K(n, B, 1, b, incb);
  }

  return 0;
}
/*
 * Worker routine for a triangular matrix-vector product over columns
 * [m_from, m_to) of a full-storage matrix with leading dimension lda.
 * The range is processed in blocks of at most DTB_ENTRIES columns: the part
 * of each block outside the diagonal block goes through MYGEMV, the diagonal
 * block is handled column by column with MYAXPY (!TRANS) or MYDOT (TRANS).
 * The result is accumulated into y, which is zeroed first.
 *
 * The diagonal contributes x[i] itself when UNIT is defined, otherwise
 * a_ii * x[i]; for complex data TRANSA 3 and 4 use the conjugate of a_ii.
 *
 *   args     a = matrix, b = x, c = y, m = order, lda = leading dimension,
 *            ldb = stride of x
 *   range_m  optional [from, to) column range; the whole matrix when NULL
 *   range_n  optional offset (in elements) applied to y, !TRANS only
 *   buffer   receives a contiguous copy of x when its stride is not 1; the
 *            remainder is passed to MYGEMV as work space
 *
 * dummy1 and pos are not used. Always returns 0.
 */
static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a = (FLOAT *)args -> a;
  FLOAT *x = (FLOAT *)args -> b;
  FLOAT *y = (FLOAT *)args -> c;
  BLASLONG lda  = args -> lda;
  BLASLONG incx = args -> ldb;
  BLASLONG m_from = 0;
  BLASLONG m_to = args -> m;
  BLASLONG i, is, min_i;
  BLASLONG clear_len;
  FLOAT *clear_dst;
  FLOAT *gemv_x, *gemv_y;

#ifdef TRANS
#ifndef COMPLEX
  FLOAT dot;
#else
  OPENBLAS_COMPLEX_FLOAT dot;
#endif
#endif

#if defined(COMPLEX) && !defined(UNIT)
  FLOAT diag_r, diag_i, x_r, x_i;
#endif

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  /* Bring the part of x that this range touches into contiguous storage. */
  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif
    x = buffer;
    buffer += ((COMPSIZE * args -> m + 3) & ~3);
  }

#ifndef TRANS
  if (range_n) y += range_n[0] * COMPSIZE;
#endif

  /* Select the slice of y that the loops below accumulate into ... */
#ifdef TRANS
  clear_len = m_to - m_from;
  clear_dst = y + m_from * COMPSIZE;
#else
#ifndef LOWER
  clear_len = m_to;
  clear_dst = y;
#else
  clear_len = args -> m - m_from;
  clear_dst = y + m_from * COMPSIZE;
#endif
#endif

  /* ... and zero it. */
  SCAL_K(clear_len, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 clear_dst, 1, NULL, 0, NULL, 0);

  for (is = m_from; is < m_to; is += DTB_ENTRIES){

    min_i = MIN(m_to - is, DTB_ENTRIES);

#ifndef LOWER
    /* Rows before the diagonal block. */
    if (is > 0){
#ifndef TRANS
      gemv_x = x + is * COMPSIZE;
      gemv_y = y;
#else
      gemv_x = x;
      gemv_y = y + is * COMPSIZE;
#endif

      MYGEMV(is, min_i, 0,
	     ONE,
#ifdef COMPLEX
	     ZERO,
#endif
	     a + is * lda * COMPSIZE, lda,
	     gemv_x, 1,
	     gemv_y, 1,
	     buffer);
    }
#endif

    for (i = is; i < is + min_i; i++) {
      FLOAT *x_cur = x + i * COMPSIZE;
      FLOAT *y_cur = y + i * COMPSIZE;

#ifndef LOWER
      /* Part of column i inside the diagonal block, before the diagonal. */
      if (i > is) {
#ifndef TRANS
	MYAXPY(i - is, 0, 0,
	       x_cur[0],
#ifdef COMPLEX
	       x_cur[1],
#endif
	       a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0);
#else
	dot = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1);

#ifndef COMPLEX
	y_cur[0] += dot;
#else
	y_cur[0] += CREAL(dot);
	y_cur[1] += CIMAG(dot);
#endif
#endif
      }
#endif

      /* Diagonal contribution. */
#ifndef COMPLEX
#ifdef UNIT
      y_cur[0] += x_cur[0];
#else
      y_cur[0] += a[(i + i * lda) * COMPSIZE] * x_cur[0];
#endif
#else
#ifdef UNIT
      y_cur[0] += x_cur[0];
      y_cur[1] += x_cur[1];
#else
      diag_r = a[(i + i * lda) * COMPSIZE + 0];
      diag_i = a[(i + i * lda) * COMPSIZE + 1];
      x_r = x_cur[0];
      x_i = x_cur[1];

#if (TRANSA == 1) || (TRANSA == 2)
      y_cur[0] += diag_r * x_r - diag_i * x_i;
      y_cur[1] += diag_r * x_i + diag_i * x_r;
#else
      y_cur[0] += diag_r * x_r + diag_i * x_i;
      y_cur[1] += diag_r * x_i - diag_i * x_r;
#endif
#endif
#endif

#ifdef LOWER
      /* Part of column i inside the diagonal block, after the diagonal. */
      if (is + min_i > i + 1) {
#ifndef TRANS
	MYAXPY(is + min_i - i - 1, 0, 0,
	       x_cur[0],
#ifdef COMPLEX
	       x_cur[1],
#endif
	       a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else
	dot = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
	y_cur[0] += dot;
#else
	y_cur[0] += CREAL(dot);
	y_cur[1] += CIMAG(dot);
#endif
#endif
      }
#endif
    }

#ifdef LOWER
    /* Rows after the diagonal block. */
    if (args -> m > is + min_i){
#ifndef TRANS
      gemv_x = x + is * COMPSIZE;
      gemv_y = y + (is + min_i) * COMPSIZE;
#else
      gemv_x = x + (is + min_i) * COMPSIZE;
      gemv_y = y + is * COMPSIZE;
#endif

      MYGEMV(args -> m - is - min_i, min_i, 0,
	     ONE,
#ifdef COMPLEX
	     ZERO,
#endif
	     a + (is + min_i + is * lda) * COMPSIZE, lda,
	     gemv_x, 1,
	     gemv_y, 1,
	     buffer);
    }
#endif
  }

  return 0;
}