Exemplo n.º 1
0
static void
assignKargs(KernelArg *args, const void *params, const void*)
{
    CLBlasKargs *blasArgs = (CLBlasKargs*)params;

#ifdef DEBUG_SYMM
    printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n",
           blasArgs->alpha.argFloat, blasArgs->alpha.argDouble, CREAL(blasArgs->alpha.argFloatComplex), CIMAG(blasArgs->alpha.argFloatComplex),
           CREAL(blasArgs->alpha.argDoubleComplex) , CIMAG(blasArgs->alpha.argDoubleComplex));
    printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n",
           blasArgs->beta.argFloat, blasArgs->beta.argDouble, CREAL(blasArgs->beta.argFloatComplex), CIMAG(blasArgs->beta.argFloatComplex),
           CREAL(blasArgs->beta.argDoubleComplex) , CIMAG(blasArgs->beta.argDoubleComplex));
#endif

    INIT_KARG(&args[0], blasArgs->A);   //A - input matrix - argument
    INIT_KARG(&args[1], blasArgs->B);
    INIT_KARG(&args[2], blasArgs->C);
    initSizeKarg(&args[3], blasArgs->M);
    initSizeKarg(&args[4], blasArgs->N);
    initSizeKarg(&args[5], blasArgs->lda.matrix);
    initSizeKarg(&args[6], blasArgs->ldb.matrix);
    initSizeKarg(&args[7], blasArgs->ldc.matrix);
    initSizeKarg(&args[8], blasArgs->offa); //PENDING: offA or offa ??
    initSizeKarg(&args[9], blasArgs->offBX);
    initSizeKarg(&args[10], blasArgs->offCY);
    assignScalarKarg(&args[11], &(blasArgs->alpha), blasArgs->dtype);
    assignScalarKarg(&args[12], &(blasArgs->beta), blasArgs->dtype);
    return;
}
Exemplo n.º 2
0
//xHEMM
void
printTestParams(
    clblasOrder order,
    clblasSide side,
    clblasUplo uplo,
    size_t M,
    size_t N,
    bool useAlpha,
    cl_float2 alpha,
    bool useBeta,
    cl_float2 beta,
    size_t lda,
    size_t ldb,
    size_t ldc,
    size_t offa,
    size_t offb,
    size_t offc )
{
    ::std::cerr << orderStr(order) << ", " << sideStr(side) << ", " << uploStr(uplo) << ::std::endl;
    ::std::cerr << "M = " << M << ", N = " << N << ::std::endl;
    ::std::cerr << "lda = " << lda << ", ldb = " << ldb << ", ldc = " << ldc<< ::std::endl;
    if (useAlpha) {
        ::std::cerr << "alpha = (" << CREAL(alpha) << "," << CIMAG(alpha) << ")" << ::std::endl; }
         if (useBeta) {
        ::std::cerr << "beta = (" << CREAL(beta) << "," << CIMAG(beta)  << ")" << ::std::endl; }
        ::std::cerr << "offa = " << offa << ", offb = " << offb << ", offc = " << offc<< ::std::endl;

}
Exemplo n.º 3
0
clblasStatus
clblasZdscal(
    size_t N,
    double alpha,
    cl_mem X,
    size_t offx,
    int incx,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
    {
         CLBlasKargs kargs;
        DoubleComplex fAlpha;

        #ifdef DEBUG_SSCAL
        printf("\nZDSCAL Called\n");
        #endif

        CREAL(fAlpha) = alpha;
        CIMAG(fAlpha) = 0.0f;

        memset(&kargs, 0, sizeof(kargs));
        kargs.alpha.argDoubleComplex = fAlpha;
        kargs.dtype = TYPE_COMPLEX_DOUBLE;

        return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);
    }
Exemplo n.º 4
0
clblasStatus
clblasZher2k(
    clblasOrder order,
    clblasUplo uplo,
    clblasTranspose trans,
    size_t N,
    size_t K,
    DoubleComplex alpha,
    const cl_mem A,
    size_t offa,
    size_t lda,
    const cl_mem B,
    size_t offb,
    size_t ldb,
    cl_double beta,
    cl_mem C,
    size_t offc,
    size_t ldc,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
{
    CLBlasKargs kargs;
    DoubleComplex fBeta;

    memset(&kargs, 0, sizeof(kargs));

    CREAL(fBeta)  = beta;
    CIMAG(fBeta)  = 0.0f;

    kargs.alpha.argDoubleComplex = alpha;
    kargs.beta.argDoubleComplex = fBeta;

    kargs.dtype = TYPE_COMPLEX_DOUBLE;

    if( order == clblasRowMajor )
    {
        CIMAG( kargs.alpha.argDoubleComplex ) *= -1.0;
    }

    return doHer2k(&kargs, order, uplo, trans, N, K, A, offa, lda, B, offb, ldb,
                    C, offc, ldc, numCommandQueues, commandQueues,
                    numEventsInWaitList, eventWaitList, events);
}
Exemplo n.º 5
0
static void
printResult(void)
{
    size_t i;
    printf("\nResult:\n");

    for (i = 0; i < N; i++) {
		printf("(%f, %-f) \n", CREAL(X[i]), CIMAG(X[i]));
    }
}
Exemplo n.º 6
0
clblasStatus
clblasZtbsv(
    clblasOrder order,
    clblasUplo uplo,
    clblasTranspose trans,
    clblasDiag diag,
    size_t N,
    size_t K,
    const cl_mem A,
    size_t offa,
    size_t lda,
    cl_mem X,
    size_t offx,
    int incx,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
{
    CLBlasKargs kargs;
    DoubleComplex alpha, beta;
    #ifdef DEBUG_TBSV
    printf("ZTBSV Called\n");
    #endif

    memset(&kargs, 0, sizeof(kargs));
    kargs.dtype = TYPE_COMPLEX_DOUBLE;
    kargs.pigFuncID = CLBLAS_TBSV;

    CREAL(alpha) = -1.0;
    CIMAG(alpha) = 0.0;
    CREAL(beta) = 1.0;
    CIMAG(beta) = 0.0;

    kargs.alpha.argDoubleComplex = alpha;
    kargs.beta.argDoubleComplex = beta;

    return doTbsv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues,
                   numEventsInWaitList, eventWaitList, events);
}
Exemplo n.º 7
0
static void
printResult(void)
{
    size_t i, j;
    printf("\nResult:\n");

    for (i = 0; i < N; i++) {
		for(j = 0; j < N; j++)
			printf("(%9.2lf, %-9.2lf)\t", CREAL( A[ i*N + j ] ), CIMAG( A[ i*N + j ] ));
		printf("\n");
    }
}
Exemplo n.º 8
0
//https://github.com/xianyi/OpenBLAS/issues/695
CTEST(potrf, bug_695){

  openblas_complex_float A1[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I,
			   -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
			   0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I,
			   2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
			   -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I,
			   0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
			   2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I,
			   0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
			   3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I,
			   0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};
  char up = 'U';

  blasint n=10;
  blasint info[1];
  BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info);
  //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91]));

  openblas_complex_double A2[100] = {3.0607147216796875+0.0*I, -0.5905849933624268-0.29020825028419495*I, 0.321084201335907+0.45168760418891907*I, 0.8387917876243591-0.644718587398529*I, -0.3642411530017853+0.051274992525577545*I, 0.8071482181549072+0.33944568037986755*I, 0.013674172572791576+0.21422699093818665*I, 0.35476258397102356+0.42408594489097595*I, -0.5991537570953369-0.23082709312438965*I, -0.0600702166557312-0.2113417387008667*I,
			    -0.7954045534133911+0.7066076993942261*I, 2.807175397872925+0.0*I, -0.1691000759601593+0.313548743724823*I, -0.30911174416542053+0.7447023987770081*I, -0.22347848117351532+0.03316075727343559*I, -0.4088296890258789-1.0214389562606812*I, -0.2344931811094284+0.08056317269802094*I, 0.793269693851471-0.17507623136043549*I, 0.03163455054163933+0.20559945702552795*I, 0.13581633567810059-0.2110036462545395*I,
			    0.9827471375465393+1.3824869394302368*I, -1.8076121807098389-0.8882446885108948*I, 2.3277781009674072+0.0*I, 0.830405056476593-0.19296252727508545*I, 0.1394239068031311-0.5260677933692932*I, 1.239942193031311-0.09915469586849213*I, 0.06731037050485611-0.059320636093616486*I, 0.11507681757211685-0.1984301060438156*I, -0.6843825578689575+0.4647614359855652*I, 1.213119387626648-0.7757048010826111*I,
			    2.619997978210449+1.8532984256744385*I, 0.4780699610710144+0.48494184017181396*I, -0.18385779857635498+0.6468567848205566*I, 2.0811400413513184+0.0*I, -0.035075582563877106+0.09732913225889206*I, 0.27337002754211426-0.9032229781150818*I, -0.8374675512313843+0.0479498989880085*I, 0.6916252374649048+0.45711082220077515*I, 0.1883818507194519+0.06482727080583572*I, -0.32384994626045227+0.05857187137007713*I,
			    -1.8306152820587158-1.2336910963058472*I, 0.5096428990364075-0.5395973920822144*I, -1.833838701248169+0.7064958810806274*I, -1.956626057624817+0.22825956344604492*I, 1.706615924835205+0.0*I, -0.2895336151123047+0.17579378187656403*I, -0.923172116279602-0.4530014097690582*I, 0.5040621757507324-0.37026339769363403*I, -0.2824432849884033-1.0374568700790405*I, 0.1399831622838974+0.4977008104324341*I,
			    0.32275113463401794+0.015575028955936432*I, -0.7285097241401672-0.10360407829284668*I, 0.041852742433547974-0.655687689781189*I, 0.07081800699234009-0.318013072013855*I, -0.25947219133377075+0.4878614842891693*I, 1.5735365152359009+0.0*I, -0.2647853195667267-0.26654252409935*I, -0.6190430521965027-0.24699924886226654*I, -0.6288471221923828+0.48154571652412415*I, 0.02446540631353855-0.2611822783946991*I,
			    2.1968812942504883+1.0640623569488525*I, -1.1760060787200928-2.714695692062378*I, 2.5673024654388428+1.9732997417449951*I, 0.3698374927043915-0.54008549451828*I, -0.4763622283935547-0.27821826934814453*I, -1.6697118282318115+0.4017511010169983*I, 1.2674795389175415+0.0*I, 0.3079095482826233-0.07258892804384232*I, -0.5929520130157471-0.038360968232154846*I, 0.04388086497783661-0.025549031794071198*I,
			    0.27894386649131775+0.9791183471679688*I, -0.42710840702056885+0.0428999662399292*I, -1.1148382425308228-0.1569381207227707*I, 0.8068630695343018+1.5315914154052734*I, -0.6160865426063538-2.0185799598693848*I, -1.439787745475769-0.7550917863845825*I, -0.10051321983337402+0.24303960800170898*I, 0.9066106081008911+0.0*I, 0.05315789580345154-0.06136537343263626*I, -0.21304509043693542+0.6494344472885132*I,
			    3.0476584434509277+0.1854848861694336*I, -1.7228562831878662+2.8335886001586914*I, 2.4704504013061523-1.0389463901519775*I, 1.564915418624878-1.6229296922683716*I, -2.7767486572265625+1.769376516342163*I, -0.314566969871521-1.0403450727462769*I, 1.4415971040725708+0.29750674962997437*I, -0.5856801271438599-1.0203559398651123*I, 0.5668219923973083+0.0*I, 0.033351436257362366-0.07832501083612442*I,
			    0.3842993974685669+0.7050991058349609*I, 1.894256591796875+0.6389734745025635*I, 1.085827112197876-1.2980060577392578*I, -0.11207738518714905+1.2014245986938477*I, 0.04810279607772827-0.9741873741149902*I, -0.31978556513786316+0.13701045513153076*I, 1.2217860221862793-0.856549859046936*I, 0.7103452086448669+0.84221351146698*I, -0.9617416858673096-1.2486815452575684*I, 0.0756804421544075+0.0*I};
  openblas_complex_double B[20] = {-0.21782716937787788-0.9222220085490986*I, -0.7620356655676837+0.15533508334193666*I, -0.905011814118756+0.2847570854574069*I, -0.3451346708401685+1.076948486041297*I, 0.25336108035924787+0.975317836492159*I, 0.11192755545114-0.1603741874112385*I, -0.20604111555491242+0.10570814584017311*I, -1.0568488936791578-0.06025820467086475*I, -0.6650468984506477-0.5000967284800251*I, -1.0509472322215125+0.5022165705328413*I,
			  -0.727775859267237+0.50638268521728*I, 0.39947219167701153-0.4576746001199889*I, -0.7122162951294634-0.630289556702497*I, 0.9870834574024372-0.2825689605519449*I, 0.0628393808469436-0.1253397353973715*I, 0.8439562576196216+1.0850814110398734*I, 0.562377322638969-0.2578030745663871*I, 0.12696236014017806-0.09853584666755086*I, -0.023682508769195098+0.18093440285319276*I, -0.7264975746431271+0.31670415674097235*I};
  char lo = 'L';
  blasint nrhs = 2;
  BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info);

  // note that this is exactly equal to A1
  openblas_complex_float A3[100] = {5.8525753+0.0*I, -0.79540455-0.7066077*I, 0.98274714-1.3824869*I, 2.619998-1.8532984*I, -1.8306153+1.2336911*I, 0.32275113-0.015575029*I, 2.1968813-1.0640624*I, 0.27894387-0.97911835*I, 3.0476584-0.18548489*I, 0.3842994-0.7050991*I,
			   -0.79540455+0.7066077*I, 8.313246+0.0*I, -1.8076122+0.8882447*I, 0.47806996-0.48494184*I, 0.5096429+0.5395974*I, -0.7285097+0.10360408*I, -1.1760061+2.7146957*I, -0.4271084-0.042899966*I, -1.7228563-2.8335886*I, 1.8942566-0.6389735*I,
			   0.98274714+1.3824869*I, -1.8076122-0.8882447*I, 9.367975+0.0*I, -0.1838578-0.6468568*I, -1.8338387-0.7064959*I, 0.041852742+0.6556877*I, 2.5673025-1.9732997*I, -1.1148382+0.15693812*I, 2.4704504+1.0389464*I, 1.0858271+1.298006*I,
			   2.619998+1.8532984*I, 0.47806996+0.48494184*I, -0.1838578+0.6468568*I, 3.1117508+0.0*I, -1.956626-0.22825956*I, 0.07081801+0.31801307*I, 0.3698375+0.5400855*I, 0.80686307-1.5315914*I, 1.5649154+1.6229297*I, -0.112077385-1.2014246*I,
			   -1.8306153-1.2336911*I, 0.5096429-0.5395974*I, -1.8338387+0.7064959*I, -1.956626+0.22825956*I, 3.6439795+0.0*I, -0.2594722-0.48786148*I, -0.47636223+0.27821827*I, -0.61608654+2.01858*I, -2.7767487-1.7693765*I, 0.048102796+0.9741874*I,
			   0.32275113+0.015575029*I, -0.7285097-0.10360408*I, 0.041852742-0.6556877*I, 0.07081801-0.31801307*I, -0.2594722+0.48786148*I, 3.624376+0.0*I, -1.6697118-0.4017511*I, -1.4397877+0.7550918*I, -0.31456697+1.0403451*I, -0.31978557-0.13701046*I,
			   2.1968813+1.0640624*I, -1.1760061-2.7146957*I, 2.5673025+1.9732997*I, 0.3698375-0.5400855*I, -0.47636223-0.27821827*I, -1.6697118+0.4017511*I, 6.8273163+0.0*I, -0.10051322-0.24303961*I, 1.4415971-0.29750675*I, 1.221786+0.85654986*I,
			   0.27894387+0.97911835*I, -0.4271084+0.042899966*I, -1.1148382-0.15693812*I, 0.80686307+1.5315914*I, -0.61608654-2.01858*I, -1.4397877-0.7550918*I, -0.10051322+0.24303961*I, 3.4057708+0.0*I, -0.5856801+1.0203559*I, 0.7103452-0.8422135*I,
			   3.0476584+0.18548489*I, -1.7228563+2.8335886*I, 2.4704504-1.0389464*I, 1.5649154-1.6229297*I, -2.7767487+1.7693765*I, -0.31456697-1.0403451*I, 1.4415971+0.29750675*I, -0.5856801-1.0203559*I, 7.005772+0.0*I, -0.9617417+1.2486815*I,
			   0.3842994+0.7050991*I, 1.8942566+0.6389735*I, 1.0858271-1.298006*I, -0.112077385+1.2014246*I, 0.048102796-0.9741874*I, -0.31978557+0.13701046*I, 1.221786-0.85654986*I, 0.7103452+0.8422135*I, -0.9617417-1.2486815*I, 3.4629636+0.0*I};

  BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info);
  //  printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91]));
  if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) {
    CTEST_ERR("%s:%d  got NaN", __FILE__, __LINE__);
  }
}
Exemplo n.º 9
0
static void
printResult(void)
{
    size_t i, nElements;

    printf("Result:\n");

    nElements = (sizeof(Y) / sizeof(cl_double2)) / incy;
    for (i = 0; i < nElements; i++) {
        printf("(%9.2f, %-9.2f)\n", CREAL(Y[i * incy]), CIMAG(Y[i * incy]));
    }
}
Exemplo n.º 10
0
static void
printResult(void)
{
    size_t i, j;

    printf("Result:\n");

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            printf("(%9.2f, %-9.2f) ", CREAL(C[i + j * ldc]), CIMAG(C[i + j * ldc]));
        }
        printf("\n");
    }
}
Exemplo n.º 11
0
void write_output(imf_list_t list,mxArray *plhs[]) {
  double *rout,*iout,*out2;
  imf_t *current;
  int i=0,j,m=list.m,n=list.n;
  plhs[0]=mxCreateDoubleMatrix(m,n,mxCOMPLEX);
  rout=mxGetPr(plhs[0]);
  iout=mxGetPi(plhs[0]);
  plhs[1]=mxCreateDoubleMatrix(1,m-1,mxCOMPLEX);
  out2=mxGetPr(plhs[1]);
  for (current=list.first;current;current=current->next) {
    for (j=0;j<n;j++) {
      *(rout+j*m+i)=CREAL(current->pointer[j]);
      *(iout+j*m+i)=CIMAG(current->pointer[j]);
    }
    if (i<m-1) *(out2+i)=current->nb_iterations;
    i++;
  }
}
Exemplo n.º 12
0
template <typename ElemType> nano_time_t
Her2kPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;
    clblasOrder order;
    clblasUplo fUplo;
    clblasTranspose fTransA;
    ElemType fAlpha;

#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType));
    order = params_.order;
    fUplo = params_.uplo;
    fTransA = params_.transA;
    fAlpha = alpha_;

    if (order != clblasColumnMajor)
    {
        CIMAG( fAlpha ) *= -1.0;
        fTransA = (params_.transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
        fUplo   = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper;
    }


#ifdef PERF_TEST_WITH_ACML

    time = getCurrentTime();
    clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params_.N, params_.K, fAlpha,
                        A_, 0, params_.lda, B_, 0, params_.ldb, CREAL( beta_), C_, 0, params_.ldc);
    time = getCurrentTime() - time;

#endif  // PERF_TEST_WITH_ACML

    return time;
}
Exemplo n.º 13
0
static void
printResult(void)
{
    size_t i, j, off;
    printf("\nResult:\n");

    off = 0;
    for (i = 0; i < N; i++) {
        for(j = 0; j < N; j++)  {
            if( ( (uplo == clblasUpper) && (i > j)) || ((uplo == clblasLower) && (j > i)) )
            {
                printf("\t\t\t");
                continue;
            }

			printf("(%9.2lf, %-9.2lf)\t", CREAL( AP[ off ] ), CIMAG( AP[ off ] ));
            off ++ ;
        }
		printf("\n");
    }
}
Exemplo n.º 14
0
//HER2
void
printTestParams(
    clblasOrder order,
    clblasUplo  uplo,
    size_t N,
    bool useAlpha,
    cl_float2 alpha,
    size_t offx,
    int incx,
    size_t offy,
    int incy,
    size_t offa,
    size_t lda)
{
    ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl;
    ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl;
    ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl;
    ::std::cerr << "offa = " << offa << ::std::endl;
    if( lda )
        ::std::cerr << ", lda = " << lda << ::std::endl;
        if(useAlpha)
    ::std::cerr << "alpha = (" << CREAL(alpha) << ", " << CIMAG(alpha) << ")" << ::std::endl;
}
Exemplo n.º 15
0
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer) {

    BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
    OPENBLAS_COMPLEX_FLOAT temp;
#endif
#ifndef UNIT
    FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
    FLOAT *gemvbuffer = (FLOAT *)buffer;
    FLOAT *B = b;

    if (incb != 1) {
        B = buffer;
        gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
        COPY_K(m, b, incb, buffer, 1);
    }

    for (is =0; is < m; is += DTB_ENTRIES) {

        min_i = MIN(m - is, DTB_ENTRIES);

#if (TRANSA) == 1 || (TRANSA == 3)
        if (is > 0) {
#if   TRANSA == 1
            GEMV_N(is, min_i, 0, dp1, ZERO,
                   a + is * lda * 2, lda,
                   B + is       * 2, 1,
                   B,                1, gemvbuffer);
#else
            GEMV_R(is, min_i, 0, dp1, ZERO,
                   a + is * lda * 2, lda,
                   B + is       * 2, 1,
                   B,                1, gemvbuffer);
#endif
        }
#endif

        for (i = 0; i < min_i; i++) {
            FLOAT *AA = a + (is + (i + is) * lda) * 2;
            FLOAT *BB = B + is * 2;

#if (TRANSA == 1) || (TRANSA == 3)
#if   TRANSA == 1
            if (i > 0) AXPYU_K (i, 0, 0, BB[i * 2 + 0],  BB[i * 2 + 1],
                                    AA, 1, BB, 1, NULL, 0);
#else
            if (i > 0) AXPYC_K(i, 0, 0, BB[i * 2 + 0],  BB[i * 2 + 1],
                                   AA, 1, BB, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
            atemp1 = AA[i * 2 + 0];
            atemp2 = AA[i * 2 + 1];

            btemp1 = BB[i * 2 + 0];
            btemp2 = BB[i * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
            BB[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
            BB[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
            BB[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
            BB[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
            if (i < min_i - 1) {
#if TRANSA == 2
                temp = DOTU_K(min_i - i - 1,
                              AA + (i + 1) * 2, 1,
                              BB + (i + 1) * 2, 1);
#else
                temp = DOTC_K(min_i - i - 1,
                              AA + (i + 1) * 2, 1,
                              BB + (i + 1) * 2, 1);
#endif

                BB[i * 2 + 0] += CREAL(temp);
                BB[i * 2 + 1] += CIMAG(temp);
            }
#endif

        }

#if (TRANSA) == 2 || (TRANSA == 4)
        if (m - is > min_i) {
#if TRANSA == 2
            GEMV_T(m - is - min_i, min_i, 0, dp1, ZERO,
                   a + (is + min_i + is * lda) * 2,  lda,
                   B + (is + min_i) * 2, 1,
                   B +  is            * 2, 1, gemvbuffer);
#else
            GEMV_C(m - is - min_i, min_i, 0, dp1, ZERO,
                   a + (is + min_i + is * lda) * 2,  lda,
                   B + (is + min_i) * 2, 1,
                   B +  is            * 2, 1, gemvbuffer);
#endif
        }
#endif
    }

    if (incb != 1) {
        COPY_K(m, buffer, 1, b, incb);
    }

    return 0;
}
void fastsum_benchomp_createdataset(unsigned int d, int L, int M)
{
  int t, j, k;
  R *x;
  R *y;
  C *alpha;

  x = (R*) NFFT(malloc)((size_t)(d * L) * sizeof(R));
  y = (R*) NFFT(malloc)((size_t)(d * L) * sizeof(R));
  alpha = (C*) NFFT(malloc)((size_t)(L) * sizeof(C));

  /** init source knots in a d-ball with radius 1 */
  k = 0;
  while (k < L)
  {
    R r_max = K(1.0);
    R r2 = K(0.0);

    for (j = 0; j < d; j++)
      x[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max;

    for (j = 0; j < d; j++)
      r2 += x[k * d + j] * x[k * d + j];

    if (r2 >= r_max * r_max)
      continue;

    k++;
  }

  NFFT(vrand_unit_complex)(alpha, L);

  /** init target knots in a d-ball with radius 1 */
  k = 0;
  while (k < M)
  {
    R r_max = K(1.0);
    R r2 = K(0.0);

    for (j = 0; j < d; j++)
      y[k * d + j] = K(2.0) * r_max * NFFT(drand48)() - r_max;

    for (j = 0; j < d; j++)
      r2 += y[k * d + j] * y[k * d + j];

    if (r2 >= r_max * r_max)
      continue;

    k++;
  }

  printf("%d %d %d\n", d, L, M);

  for (j = 0; j < L; j++)
  {
    for (t = 0; t < d; t++)
      printf("%.16" __FES__ " ", x[d * j + t]);
    printf("\n");
  }

  for (j = 0; j < L; j++)
    printf("%.16" __FES__ " %.16" __FES__ "\n", CREAL(alpha[j]), CIMAG(alpha[j]));

  for (j = 0; j < M; j++)
  {
    for (t = 0; t < d; t++)
      printf("%.16" __FES__ " ", y[d * j + t]);
    printf("\n");
  }

  NFFT(free)(x);
  NFFT(free)(y);
  NFFT(free)(alpha);
}
Exemplo n.º 17
0
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex temp;
#endif
#ifndef UNIT
  FLOAT atemp1, atemp2, btemp1, btemp2;
#endif
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  a += (m + 1) * m - 2;

  for (i = 0; i < m; i++) {

#if (TRANSA == 1) || (TRANSA == 3)
#if TRANSA == 1
      if (i > 0) AXPYU_K (i, 0, 0,
			  B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
			  a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#else
      if (i > 0) AXPYC_K(i, 0, 0,
			  B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1],
			  a + 2, 1, B + (m - i) * 2, 1, NULL, 0);
#endif
#endif

#ifndef UNIT
      atemp1 = a[0];
      atemp2 = a[1];

      btemp1 = B[(m - i - 1) * 2 + 0];
      btemp2 = B[(m - i - 1) * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
      B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
      B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
      B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
      B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
      if (i < m - 1) {
#if   TRANSA == 2
	temp = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#else
	temp = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1);
#endif

	B[(m - i - 1) * 2 + 0] += CREAL(temp);
	B[(m - i - 1) * 2 + 1] += CIMAG(temp);
      }
#endif

#if (TRANSA == 1) || (TRANSA == 3)
    a -= (i + 2) * 2;
#else
    a -= (m - i) * 2;
#endif

    }


  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
Exemplo n.º 18
0
int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, 
	 FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i;
  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = gemvbuffer;
  FLOAT *bufferX    = gemvbuffer;
  FLOAT temp[2];

  if (incy != 1) {
    Y = bufferY;
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    gemvbuffer = bufferX;
    COPY_K(m, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, x, incx, X, 1);
  }

  for (i = 0; i < m; i++) {

#ifndef HEMVREV
#ifndef LOWER
    if (i > 0) {
      FLOAT _Complex result = DOTC_K(i, a, 1, X, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (i > 0) {
      AXPYU_K(i, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a, 1, Y, 1, NULL, 0);
    }

    a += (i + 1) * 2;
  
#else

    if (m - i > 1) {
      FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (m - i > 1) {
      AXPYU_K(m - i - 1, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
    }

    a += (m - i - 1) * 2;

#endif
#else
#ifndef LOWER
    if (i > 0) {
      FLOAT _Complex result = DOTU_K(i, a, 1, X, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (i > 0) {
      AXPYC_K(i, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a, 1, Y, 1, NULL, 0);
    }

    a += (i + 1) * 2;
  
#else

    if (m - i > 1) {
      FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
    }

    temp[0] = a[i * 2 + 0] * X[i * 2 + 0];
    temp[1] = a[i * 2 + 0] * X[i * 2 + 1];

    Y[i * 2 + 0] +=  alpha_r * temp[0] - alpha_i * temp[1];
    Y[i * 2 + 1] +=  alpha_r * temp[1] + alpha_i * temp[0];

    if (m - i > 1) {
      AXPYC_K(m - i - 1, 0, 0,
	      alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	      alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	      a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0);
    }

    a += (m - i - 1) * 2;

#endif
#endif

  }
    
  if (incy != 1) {
    COPY_K(m, Y, 1, y, incy);
  }

  return 0;
}
Exemplo n.º 19
0
static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n, k, n_from, n_to;
  BLASLONG i, length;
#ifndef COMPLEX
  FLOAT result;
#else
  FLOAT _Complex result;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;

  lda  = args -> lda;
  incx = args -> ldb;

  n = args -> n;
  k = args -> k;

  n_from = 0;
  n_to   = n;

  //Use y as each thread's n* COMPSIZE elements in sb buffer
  y = buffer;   
  buffer += ((COMPSIZE * n  + 1023) & ~1023);

  if (range_m) {
    n_from = *(range_m + 0);
    n_to   = *(range_m + 1);

    a += n_from * lda  * COMPSIZE;
  }


  if (incx != 1) {
    COPY_K(n, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * n  + 1023) & ~1023);
  }

  SCAL_K(n, 0, 0, ZERO, 
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);  
  
  for (i = n_from; i < n_to; i++) {

#ifndef LOWER

    length  = i;
    if (length > k) length = k;

    MYAXPY(length, 0, 0,
	   *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	   *(x + i * COMPSIZE + 1),
#endif
	   a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0);

#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1);
#else
    result = MYDOT(length    , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE + 0) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 1);
#endif
#endif

#else

    length  = k;
    if (n - i - 1 < k) length = n - i - 1;

    MYAXPY(length, 0, 0,
	   *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	   *(x + i * COMPSIZE + 1),
#endif
	   a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1);
#else
    result = MYDOT(length    , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ;
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE + 0) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    *(y + i * COMPSIZE + 0) += CREAL(result) + *a * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *a * *(x + i * COMPSIZE + 1);
#endif
#endif

#endif

    a += lda * COMPSIZE;
  }

  return 0;
}
Exemplo n.º 20
0
static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG lda, incx;
  BLASLONG n_from, n_to;
  BLASLONG i, offset_l, offset_u, uu, ll, ku, kl;
#ifdef TRANSA
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  lda  = args -> lda;
  incx = args -> ldb;
  ku   = args -> ldc;
  kl   = args -> ldd;

  n_from = 0;
  n_to   = args -> n;

  if (range_m) y += *range_m * COMPSIZE;

  if (range_n) {
    n_from = *(range_n + 0);
    n_to   = *(range_n + 1);

    a += n_from * lda  * COMPSIZE;
  }

  n_to = MIN(n_to, args -> m + ku);

#ifdef TRANSA
  if (incx != 1) {
    COPY_K(args -> m, x, incx, buffer, 1);

    x = buffer;
    buffer += ((COMPSIZE * args -> m  + 1023) & ~1023);
  }
#endif

  SCAL_K(
#ifndef TRANSA
	 args -> m,
#else
	 args -> n,
#endif
	 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);

  offset_u = ku - n_from;
  offset_l = ku - n_from + args -> m;

#ifndef TRANSA
  x += n_from * incx * COMPSIZE;
  y -= offset_u      * COMPSIZE;
#else
  x -= offset_u      * COMPSIZE;
  y += n_from        * COMPSIZE;
#endif

  for (i = n_from; i < n_to; i++) {

    uu = MAX(offset_u, 0);
    ll = MIN(offset_l, ku + kl + 1);

#ifndef TRANSA
    MYAXPY(ll - uu, 0, 0,
	    *(x + 0),
#ifdef COMPLEX
#ifndef XCONJ
	     *(x + 1),
#else
	    -*(x + 1),
#endif
#endif
	    a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0);

    x += incx * COMPSIZE;
#else
    result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, x + uu * COMPSIZE, 1);

#ifndef COMPLEX
    *y = result;
#else
    *(y + 0) += CREAL(result);
#ifndef XCONJ
    *(y + 1) += CIMAG(result);
#else
    *(y + 1) -= CIMAG(result);
#endif
#endif

    x += COMPSIZE;
#endif

    y += COMPSIZE;

    offset_u --;
    offset_l --;

    a += lda * COMPSIZE;
  }

  return 0;
}
Exemplo n.º 21
0
clblasStatus
doHer2k(
    CLBlasKargs *kargs,
    clblasOrder order,
    clblasUplo uplo,
    clblasTranspose transA,
    size_t N,
    size_t K,
    const cl_mem A,
    size_t offa,
    size_t lda,
    const cl_mem B,
    size_t offb,
    size_t ldb,
    cl_mem C,
    size_t offc,
    size_t ldc,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
{
    clblasStatus err;
    clblasUplo fUplo;
    clblasTranspose fTransA;
    cl_event firstHerkCall;
    clblasStatus retCode = clblasSuccess;

    if (!clblasInitialized) {
        return clblasNotInitialized;
    }

    if (numCommandQueues == 0 || commandQueues == NULL) {
        return clblasInvalidValue;
    }
    numCommandQueues = 1;

    if ((numEventsInWaitList !=0) && (eventWaitList == NULL))
    {
        return clblasInvalidEventWaitList;
    }

    // Validate arguments
    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
        return retCode;
    }

    if (transA == clblasTrans) {
        return clblasInvalidValue;
    }

    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET )) {
        return retCode;
    }

    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET )) {
        return retCode;
    }

    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET )) {
        return retCode;
    }

    if ((numEventsInWaitList !=0) && (eventWaitList == NULL))
    {
        return clblasInvalidEventWaitList;
    }

    fUplo = (order == clblasRowMajor) ? ((uplo == clblasLower) ? clblasUpper : clblasLower) : uplo;
    fTransA = (order == clblasRowMajor) ? ((transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans) : transA;
    kargs->order = (order == clblasRowMajor) ? clblasColumnMajor : order;

    kargs->transA = fTransA;
    kargs->transB = (fTransA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;

    kargs->uplo = fUplo;
    kargs->M = N;
    kargs->N = N;
    kargs->K = K;
    kargs->A = A;
    kargs->offA = offa;
    kargs->offa = offa;
    kargs->lda.matrix = lda;
    kargs->B = B;
    kargs->offBX = offb;
    kargs->ldb.matrix = ldb;
    kargs->C = C;
    kargs->offCY = offc;
    kargs->ldc.matrix = ldc;
    kargs->pigFuncID = CLBLAS_HERK;

    err = executeGEMM(kargs,  numCommandQueues, commandQueues,
                            numEventsInWaitList, eventWaitList, &firstHerkCall);

    if( err == CL_SUCCESS )
    {
        kargs->A = B;
        kargs->offA = offb;
        kargs->offa = offb;
        kargs->lda.matrix = ldb;
        kargs->B = A;
        kargs->offBX = offa;
        kargs->ldb.matrix = lda;

        if( kargs->dtype == TYPE_COMPLEX_FLOAT )
        {
            CIMAG( kargs->alpha.argFloatComplex ) *= -1.0;
            CREAL( kargs->beta.argFloatComplex ) = 1.0;
            CIMAG( kargs->beta.argFloatComplex ) = 0.0;
        }
        else
        {
            CIMAG( kargs->alpha.argDoubleComplex ) *= -1.0;
            CREAL( kargs->beta.argDoubleComplex ) = 1.0;
            CIMAG( kargs->beta.argDoubleComplex ) = 0.0;
        }

        err = executeGEMM(kargs,  numCommandQueues, commandQueues, 1, &firstHerkCall, events);
    }

    return (clblasStatus)err;
}
Exemplo n.º 22
0
int main(int argc, char **argv)
{
  int j, k, t; /**< indices                 */
  int d; /**< number of dimensions    */
  int N; /**< number of source nodes  */
  int M; /**< number of target nodes  */
  int n; /**< expansion degree        */
  int m; /**< cut-off parameter       */
  int p; /**< degree of smoothness    */
  const char *s; /**< name of kernel          */
  C (*kernel)(R, int, const R *); /**< kernel function         */
  R c; /**< parameter for kernel    */
  fastsum_plan my_fastsum_plan; /**< plan for fast summation */
  C *direct; /**< array for direct computation */
  ticks t0, t1; /**< for time measurement    */
  R time; /**< for time measurement    */
  R error = K(0.0); /**< for error computation   */
  R eps_I; /**< inner boundary          */
  R eps_B; /**< outer boundary          */
  FILE *fid1, *fid2;
  R temp;

  if (argc != 11)
  {
    printf("\nfastsum_test d N M n m p kernel c\n\n");
    printf("  d       dimension                 \n");
    printf("  N       number of source nodes    \n");
    printf("  M       number of target nodes    \n");
    printf("  n       expansion degree          \n");
    printf("  m       cut-off parameter         \n");
    printf("  p       degree of smoothness      \n");
    printf("  kernel  kernel function  (e.g., gaussian)\n");
    printf("  c       kernel parameter          \n");
    printf("  eps_I   inner boundary            \n");
    printf("  eps_B   outer boundary            \n\n");
    exit(-1);
  }
  else
  {
    d = atoi(argv[1]);
    N = atoi(argv[2]);
    c = K(1.0) / POW((R)(N), K(1.0) / ((R)(d)));
    M = atoi(argv[3]);
    n = atoi(argv[4]);
    m = atoi(argv[5]);
    p = atoi(argv[6]);
    s = argv[7];
    c = (R)(atof(argv[8]));
    eps_I = (R)(atof(argv[9]));
    eps_B = (R)(atof(argv[10]));
    if (strcmp(s, "gaussian") == 0)
      kernel = gaussian;
    else if (strcmp(s, "multiquadric") == 0)
      kernel = multiquadric;
    else if (strcmp(s, "inverse_multiquadric") == 0)
      kernel = inverse_multiquadric;
    else if (strcmp(s, "logarithm") == 0)
      kernel = logarithm;
    else if (strcmp(s, "thinplate_spline") == 0)
      kernel = thinplate_spline;
    else if (strcmp(s, "one_over_square") == 0)
      kernel = one_over_square;
    else if (strcmp(s, "one_over_modulus") == 0)
      kernel = one_over_modulus;
    else if (strcmp(s, "one_over_x") == 0)
      kernel = one_over_x;
    else if (strcmp(s, "inverse_multiquadric3") == 0)
      kernel = inverse_multiquadric3;
    else if (strcmp(s, "sinc_kernel") == 0)
      kernel = sinc_kernel;
    else if (strcmp(s, "cosc") == 0)
      kernel = cosc;
    else if (strcmp(s, "cot") == 0)
      kernel = kcot;
    else
    {
      s = "multiquadric";
      kernel = multiquadric;
    }
  }
  printf(
      "d=%d, N=%d, M=%d, n=%d, m=%d, p=%d, kernel=%s, c=%" __FGS__ ", eps_I=%" __FGS__ ", eps_B=%" __FGS__ " \n",
      d, N, M, n, m, p, s, c, eps_I, eps_B);

  /** init two dimensional fastsum plan */
  fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, 0, n, m, p, eps_I,
      eps_B);
  /*fastsum_init_guru(&my_fastsum_plan, d, N, M, kernel, &c, EXACT_NEARFIELD, n, m, p);*/

  /** load source knots and coefficients */
  fid1 = fopen("x.dat", "r");
  fid2 = fopen("alpha.dat", "r");
  for (k = 0; k < N; k++)
  {
    for (t = 0; t < d; t++)
    {
      fscanf(fid1, __FR__, &my_fastsum_plan.x[k * d + t]);
    }
    fscanf(fid2, __FR__, &temp);
    my_fastsum_plan.alpha[k] = temp;
    fscanf(fid2, __FR__, &temp);
    my_fastsum_plan.alpha[k] += temp * II;
  }
  fclose(fid1);
  fclose(fid2);

  /** load target knots */
  fid1 = fopen("y.dat", "r");
  for (j = 0; j < M; j++)
  {
    for (t = 0; t < d; t++)
    {
      fscanf(fid1, __FR__, &my_fastsum_plan.y[j * d + t]);
    }
  }
  fclose(fid1);

  /** direct computation */
  printf("direct computation: ");
  fflush(NULL);
  t0 = getticks();
  fastsum_exact(&my_fastsum_plan);
  t1 = getticks();
  time = NFFT(elapsed_seconds)(t1, t0);
  printf(__FI__ "sec\n", time);

  /** copy result */
  direct = (C *) NFFT(malloc)((size_t)(my_fastsum_plan.M_total) * (sizeof(C)));
  for (j = 0; j < my_fastsum_plan.M_total; j++)
    direct[j] = my_fastsum_plan.f[j];

  /** precomputation */
  printf("pre-computation:    ");
  fflush(NULL);
  t0 = getticks();
  fastsum_precompute(&my_fastsum_plan);
  t1 = getticks();
  time = NFFT(elapsed_seconds)(t1, t0);
  printf(__FI__ "sec\n", time);

  /** fast computation */
  printf("fast computation:   ");
  fflush(NULL);
  t0 = getticks();
  fastsum_trafo(&my_fastsum_plan);
  t1 = getticks();
  time = NFFT(elapsed_seconds)(t1, t0);
  printf(__FI__ "sec\n", time);

  /** compute max error */
  error = K(0.0);
  for (j = 0; j < my_fastsum_plan.M_total; j++)
  {
    if (CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]) > error)
      error = CABS(direct[j] - my_fastsum_plan.f[j]) / CABS(direct[j]);
  }
  printf("max relative error: " __FE__ "\n", error);

  /** write result to file */
  fid1 = fopen("f.dat", "w+");
  fid2 = fopen("f_direct.dat", "w+");
  if (fid1 == NULL)
  {
    printf("Fehler!\n");
    exit(EXIT_FAILURE);
  }
  for (j = 0; j < M; j++)
  {
    temp = CREAL(my_fastsum_plan.f[j]);
    fprintf(fid1, "  % .16" __FES__ "", temp);
    temp = CIMAG(my_fastsum_plan.f[j]);
    fprintf(fid1, "  % .16" __FES__ "\n", temp);

    temp = CREAL(direct[j]);
    fprintf(fid2, "  % .16" __FES__ "", temp);
    temp = CIMAG(direct[j]);
    fprintf(fid2, "  % .16" __FES__ "\n", temp);
  }
  fclose(fid1);
  fclose(fid2);

  /** finalise the plan */
  fastsum_finalize(&my_fastsum_plan);

  return EXIT_SUCCESS;
}
Exemplo n.º 23
0
void
her2kCorrectnessTest(TestParams *params)
{
    cl_int err;
    T *A, *B, *blasC, *clblasC;
    T alpha, beta;
    cl_mem bufA, bufC, bufB;
    clMath::BlasBase *base;
    cl_event *events;

    if (params->transA == clblasTrans) {
        ::std::cerr << ">> her2k(TRANSPOSE) for complex numbers "
                           "is not allowed." << ::std::endl <<
                           ">> Test skipped." << ::std::endl;
            SUCCEED();
            return;
        }

    base = clMath::BlasBase::getInstance();

    if ((typeid(T) == typeid(cl_double) ||
         typeid(T) == typeid(DoubleComplex)) &&
        !base->isDevSupportDoublePrecision()) {

        std::cerr << ">> WARNING: The target device doesn't support native "
                     "double precision floating point arithmetic" <<
                     std::endl << ">> Test skipped" << std::endl;
        SUCCEED();
        return;
    }

    events = new cl_event[params->numCommandQueues];
    memset(events, 0, params->numCommandQueues * sizeof(cl_event));

    A = new T[params->rowsA * params->columnsA];
    B = new T[params->rowsB * params->columnsB];
    blasC = new T[params->rowsC * params->columnsC];
    clblasC = new T[params->rowsC * params->columnsC];

	if((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL))
	{
		deleteBuffers<T>(A, B, blasC, clblasC);
		::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl;
        delete[] events;
        SUCCEED();
        return;
	}

    srand(params->seed);

    alpha = convertMultiplier<T>(params->alpha);
    beta  = convertMultiplier<T>(params->beta);

    ::std::cerr << "Generating input data... ";

    clblasTranspose ftransB = (params->transA==clblasNoTrans)? clblasConjTrans: clblasNoTrans;

    randomGemmMatrices<T>(params->order, params->transA, ftransB,
                                params->N, params->N, params->K, true, &alpha, A, params->lda,
                                B, params->ldb, true, &beta, blasC, params->ldc);

    memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC));
    ::std::cerr << "Done" << ::std::endl;

    bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A),
                                     CL_MEM_READ_ONLY);
    bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B),
                                     CL_MEM_READ_ONLY);
    bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC),
                                     params->offCY * sizeof(*clblasC),
                                     CL_MEM_READ_WRITE);

    if ((bufA == NULL) || (bufB == NULL)|| (bufC == NULL)) {
        /* Skip the test, the most probable reason is
         *     matrix too big for a device.
         */
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ::std::cerr << ">> Failed to create/enqueue buffer for a matrix."
            << ::std::endl
            << ">> Can't execute the test, because data is not transfered to GPU."
            << ::std::endl
            << ">> Test skipped." << ::std::endl;
        SUCCEED();
        return;
    }

    ::std::cerr << "Calling reference xHER2K routine... ";
    T fAlpha = alpha;
    if (params->order == clblasColumnMajor) {
        ::clMath::blas::her2k(clblasColumnMajor, params->uplo, params->transA,
                                params->N, params->K, fAlpha, A, 0, params->lda, B, 0, params->ldb,
                                CREAL(beta), blasC, 0, params->ldc);
    }
    else {

		CIMAG( fAlpha ) *= -1.0;        // According to netlib C- interface
        clblasTranspose fTransA = (params->transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
		clblasUplo      fUplo   = (params->uplo == clblasUpper) ? clblasLower : clblasUpper;

		::clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params->N, params->K, fAlpha,
						        A, 0, params->lda, B, 0, params->ldb, CREAL(beta), blasC, 0, params->ldc);

    }
    ::std::cerr << "Done" << ::std::endl;

    ::std::cerr << "Calling clblas xHER2K routine... ";
    err = (cl_int)::clMath::clblas::her2k(params->order, params->uplo,
                                         params->transA, params->N, params->K,
                                         alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb,
                                         CREAL(beta), bufC, params->offCY,
                                         params->ldc, params->numCommandQueues,
                                         base->commandQueues(), 0, NULL,
                                         events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2K() failed";
    }

    err = waitForSuccessfulFinish(params->numCommandQueues,
        base->commandQueues(), events);
    if (err != CL_SUCCESS) {
        releaseMemObjects(bufA, bufB, bufC);
        deleteBuffers<T>(A, B, blasC, clblasC);
        delete[] events;
        ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()";
    }
    ::std::cerr << "Done" << ::std::endl;

    clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC),
                        params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL);

    releaseMemObjects(bufA, bufB, bufC);
    compareMatrices<T>(params->order, params->N, params->N, blasC, clblasC, params->ldc);

    deleteBuffers<T>(A, B, blasC, clblasC);
    delete[] events;
}
Exemplo n.º 24
0
int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
	  FLOAT *a, BLASLONG lda,
	  FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i, length;
#ifndef LOWER
  BLASLONG offset;
#endif

  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *sbmvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = sbmvbuffer;
  FLOAT *bufferX    = sbmvbuffer;

  if (incy != 1) {
    Y = bufferY;
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    sbmvbuffer = bufferX;
    COPY_K(n, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, x, incx, X, 1);
  }

#ifndef LOWER
  offset = k;
#endif

  for (i = 0; i < n; i++) {

#ifndef LOWER
    length  = k - offset;

    AXPYU_K(length + 1, 0, 0,
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	    alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	    a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0);

    if (length > 0) {
      FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1);

      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
      }

    if (offset > 0) offset --;
#else
    length  = k;
    if (n - i - 1 < k) length = n - i - 1;

    AXPYU_K(length + 1, 0, 0,
	    alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], 
	    alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], 
	    a, 1, Y + i * COMPSIZE, 1, NULL, 0);

    if (length > 0) {
      FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1);
      
      Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result);
      Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result);
      }
#endif
    
    a += lda * 2;
  }

  if (incy != 1) {
    COPY_K(n, Y, 1, y, incy);
  }

  return 0;
}
Exemplo n.º 25
0
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;
  BLASLONG incx;
  BLASLONG m_from, m_to, i;
#ifndef COMPLEX
  FLOAT result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  incx = args -> ldb;

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  if (range_n) y += *range_n * COMPSIZE;

  if (incx != 1) {
#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif

    x = buffer;
  }

#ifndef LOWER
  SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#ifndef LOWER
  a += (m_from + 1) * m_from / 2  * COMPSIZE;
#else
  a += (2 * args -> m - m_from - 1) * m_from / 2  * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++) {
#ifndef LOWER

#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(i + 1, a, 1, x, 1);
#else
    result = MYDOT(i    , a, 1, x, 1);
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1);
#endif
#endif

    MYAXPY(i, 0, 0,
	    *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	    *(x + i * COMPSIZE + 1),
#endif
	    a, 1, y, 1, NULL, 0);

    a += (i + 1) * COMPSIZE;

#else
#if !defined(HEMV) && !defined(HEMVREV)
    result = MYDOT(args -> m - i    , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1);
#else
    result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);
#endif

#ifndef COMPLEX
    *(y + i * COMPSIZE) += result;
#else
#if !defined(HEMV) && !defined(HEMVREV)
    *(y + i * COMPSIZE + 0) += CREAL(result);
    *(y + i * COMPSIZE + 1) += CIMAG(result);
#else
    *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0);
    *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1);
#endif
#endif

    MYAXPY(args -> m - i - 1, 0, 0,
	    *(x + i * COMPSIZE + 0),
#ifdef COMPLEX
	    *(x + i * COMPSIZE + 1),
#endif
	    a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);

    a += (args -> m - i - 1) * COMPSIZE;

#endif
  }

  return 0;
}
Exemplo n.º 26
0
int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i, is, min_i;
#if (TRANSA == 2) || (TRANSA == 4)
  FLOAT _Complex result;
#endif
#ifndef UNIT
  FLOAT ar, ai, br, bi, ratio, den;
#endif
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(m, b, incb, buffer, 1);
  }

  for (is = m; is > 0; is -= DTB_ENTRIES){

    min_i = MIN(is, DTB_ENTRIES);

#if (TRANSA == 2) || (TRANSA == 4)
    if (m - is > 0){
#if TRANSA == 2
      GEMV_T(m - is, min_i, 0, dm1, ZERO,
	      a + (is + (is - min_i)  * lda) * COMPSIZE, lda,
	      B +  is          * COMPSIZE, 1,
	      B + (is - min_i) * COMPSIZE, 1, gemvbuffer);
#else
      GEMV_C(m - is, min_i, 0, dm1, ZERO,
	      a + (is + (is - min_i)  * lda) * COMPSIZE, lda,
	      B +  is            * COMPSIZE, 1,
	      B + (is - min_i)  * COMPSIZE, 1, gemvbuffer);
#endif
    }
#endif

    for (i = 0; i < min_i; i++) {
      FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * COMPSIZE;
      FLOAT *BB = B + (is - i - 1) * COMPSIZE;

#if (TRANSA == 2) || (TRANSA == 4)
      if (i > 0) {
#if TRANSA == 2
	result = DOTU_K(i, AA + 2, 1, BB + 2, 1);
#else
	result = DOTC_K(i, AA + 2, 1, BB + 2, 1);
#endif

      BB[0] -= CREAL(result);
      BB[1] -= CIMAG(result);
      }
#endif

#ifndef UNIT
      ar = AA[0];
      ai = AA[1];
      
      if (fabs(ar) >= fabs(ai)){
	ratio = ai / ar;
	den = 1./(ar * ( 1 + ratio * ratio));
	
	ar =  den;
#if TRANSA < 3
	ai = -ratio * den;
#else
	ai =  ratio * den;
#endif
      } else {
	ratio = ar / ai;
	den = 1./(ai * ( 1 + ratio * ratio));
	ar =  ratio * den;
#if TRANSA < 3
	ai = -den;
#else
	ai =  den;
#endif
    }

      br = BB[0];
      bi = BB[1];
      
      BB[0] = ar*br - ai*bi;
      BB[1] = ar*bi + ai*br;
#endif

#if (TRANSA == 1) || (TRANSA == 3)
      if (i < min_i - 1) {
#if TRANSA == 1
	AXPYU_K (min_i - i - 1, 0, 0, - BB[0], -BB[1],
		 AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0);
#else
	AXPYC_K(min_i - i - 1, 0, 0, - BB[0], -BB[1],
		 AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0);
#endif
      }
#endif
    }

#if (TRANSA == 1) || (TRANSA == 3)
    if (is - min_i > 0){
#if   TRANSA == 1
      GEMV_N(is - min_i, min_i, 0, dm1, ZERO,
	      a +  (is - min_i) * lda * COMPSIZE, lda,
	      B +  (is - min_i)       * COMPSIZE, 1,
	      B,                                  1, gemvbuffer);
#else
      GEMV_R(is - min_i, min_i, 0, dm1, ZERO,
	      a +  (is - min_i) * lda * COMPSIZE, lda,
	      B +  (is - min_i)       * COMPSIZE, 1,
	      B,                                  1, gemvbuffer);
#endif
    }
#endif
  }

  if (incb != 1) {
    COPY_K(m, buffer, 1, b, incb);
  }

  return 0;
}
Exemplo n.º 27
0
static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;

  BLASLONG incx;
  BLASLONG m_from, m_to;
  BLASLONG i;

#ifdef TRANS
#ifndef COMPLEX
  FLOAT          result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

#if defined(COMPLEX) && !defined(UNIT)
  FLOAT ar, ai, xr, xi;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  incx = args -> ldb;

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  if (incx != 1) {

#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif

    x = buffer;
    buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
  }

#ifndef TRANS
  if (range_n) y += *range_n * COMPSIZE;

#ifndef LOWER
  SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#else

  SCAL_K(m_to - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);

#endif

#ifndef LOWER
  a += (m_from + 1) * m_from / 2  * COMPSIZE;
#else
  a += (2 * args -> m - m_from - 1) * m_from / 2  * COMPSIZE;
#endif

  for (i = m_from; i < m_to; i++) {

#ifndef LOWER
      if (i > 0) {
#ifndef TRANS
	MYAXPY(i, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a, 1, y, 1, NULL, 0);
#else
       	result = MYDOT(i,  a, 1, x, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif

#ifndef COMPLEX
#ifdef UNIT
    *(y + i * COMPSIZE) += *(x + i * COMPSIZE);
#else
    *(y + i * COMPSIZE) += *(a + i * COMPSIZE) * *(x + i * COMPSIZE);
#endif
#else
#ifdef UNIT
      *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0);
      *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1);
#else
      ar = *(a + i * COMPSIZE + 0);
      ai = *(a + i * COMPSIZE + 1);
      xr = *(x + i * COMPSIZE + 0);
      xi = *(x + i * COMPSIZE + 1);

#if (TRANSA == 1) || (TRANSA == 2)
      *(y + i * COMPSIZE + 0) += ar * xr - ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi + ai * xr;
#else
      *(y + i * COMPSIZE + 0) += ar * xr + ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi - ai * xr;
#endif
#endif
#endif

#ifdef LOWER
      if (args -> m > i + 1) {
#ifndef TRANS
	MYAXPY(args -> m - i - 1, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a + (i + 1 ) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else

	result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif

#ifndef LOWER
    a += (i + 1) * COMPSIZE;
#else
    a += (args -> m - i - 1) * COMPSIZE;
#endif

  }

  return 0;
}
Exemplo n.º 28
0
void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOAT alpha_i,
	  FLOAT *a, BLASLONG lda,
	  FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){

  BLASLONG i, offset_u, offset_l, start, end, length;
  FLOAT *X = x;
  FLOAT *Y = y;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *bufferY    = gemvbuffer;
  FLOAT *bufferX    = gemvbuffer;
#ifdef TRANS
  FLOAT _Complex temp;
#endif

  if (incy != 1) {
    Y = bufferY;
    bufferX    = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) * 2 + 4095) & ~4095);
    gemvbuffer = bufferX;
    COPY_K(M, y, incy, Y, 1);
  }

  if (incx != 1) {
    X = bufferX;
    gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) * 2 + 4095) & ~4095);
    COPY_K(N, x, incx, X, 1);
  }

  offset_u = ku;
  offset_l = ku + m;

  for (i = 0; i < MIN(n, m + ku); i++) {

    start = MAX(offset_u, 0);
    end   = MIN(offset_l, ku + kl + 1);

    length  = end - start;

#ifndef TRANS
    ZAXPY(length, 0, 0,
#ifndef XCONJ
	  alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1],
	  alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1],
#else
	  alpha_r * X[i * 2 + 0] + alpha_i * X[i * 2 + 1],
	  alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1],
#endif
	  a + start * 2, 1, Y + (start - offset_u) * 2, 1, NULL, 0);
#else

#ifndef XCONJ
    temp = ZDOT(length, a + start * 2, 1, X + (start - offset_u) * 2, 1);
#else
    temp = ZDOT(length, X + (start - offset_u) * 2, 1, a + start * 2, 1);
#endif

#if !defined(XCONJ) || !defined(CONJ)
    Y[i * 2 + 0] += alpha_r * CREAL(temp) - alpha_i * CIMAG(temp);
    Y[i * 2 + 1] += alpha_i * CREAL(temp) + alpha_r * CIMAG(temp);
#else
    Y[i * 2 + 0] += alpha_r * CREAL(temp) + alpha_i * CIMAG(temp);
    Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp);
#endif
#endif

    offset_u --;
    offset_l --;

    a += lda * 2;
  }

  if (incy != 1) {
    COPY_K(M, Y, 1, y, incy);
  }

  return;
}
Exemplo n.º 29
0
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){

  BLASLONG i;
  FLOAT *gemvbuffer = (FLOAT *)buffer;
  FLOAT *B = b;
  BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4)
  OPENBLAS_COMPLEX_FLOAT temp;
#endif
#ifndef UNIT
  FLOAT atemp1, atemp2, btemp1, btemp2;
#endif

  if (incb != 1) {
    B = buffer;
    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
    COPY_K(n, b, incb, buffer, 1);
  }

  for (i = 0; i < n; i++) {

#if (TRANSA == 1) || (TRANSA == 3)
    length  = i;
    if (length > k) length = k;

    if (length > 0) {
#if   TRANSA == 1
      AXPYU_K(length, 0, 0,
	      B[i * 2 + 0],  B[i * 2 + 1],
	      a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0);
#else
      AXPYC_K(length, 0, 0,
	      B[i * 2 + 0],  B[i * 2 + 1],
	      a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0);
#endif

    }
#endif

#ifndef UNIT
#if (TRANSA == 1) || (TRANSA == 3)
      atemp1 = a[k * 2 + 0];
      atemp2 = a[k * 2 + 1];
#else
      atemp1 = a[0];
      atemp2 = a[1];
#endif

      btemp1 = B[i * 2 + 0];
      btemp2 = B[i * 2 + 1];

#if (TRANSA == 1) || (TRANSA == 2)
      B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2;
      B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1;
#else
      B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2;
      B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1;
#endif
#endif

#if (TRANSA == 2) || (TRANSA == 4)
    length  = n - i - 1;
    if (length > k) length = k;

    if (length > 0) {
#if TRANSA == 2
      temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1);
#else
      temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1);
#endif

      B[i * 2 + 0] += CREAL(temp);
      B[i * 2 + 1] += CIMAG(temp);
    }
#endif

    a += lda * COMPSIZE;
  }

  if (incb != 1) {
    COPY_K(n, buffer, 1, b, incb);
  }

  return 0;
}
Exemplo n.º 30
0
static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){

  FLOAT *a, *x, *y;

  BLASLONG lda, incx;
  BLASLONG m_from, m_to;
  BLASLONG i, is, min_i;

#ifdef TRANS
#ifndef COMPLEX
  FLOAT          result;
#else
  OPENBLAS_COMPLEX_FLOAT result;
#endif
#endif

#if defined(COMPLEX) && !defined(UNIT)
  FLOAT ar, ai, xr, xi;
#endif

  a = (FLOAT *)args -> a;
  x = (FLOAT *)args -> b;
  y = (FLOAT *)args -> c;

  lda  = args -> lda;
  incx = args -> ldb;

  m_from = 0;
  m_to   = args -> m;

  if (range_m) {
    m_from = *(range_m + 0);
    m_to   = *(range_m + 1);
  }

  if (incx != 1) {

#ifndef LOWER
    COPY_K(m_to, x, incx, buffer, 1);
#else
    COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1);
#endif

    x = buffer;
    buffer += ((COMPSIZE * args -> m + 3) & ~3);
  }

#ifndef TRANS
  if (range_n) y += *range_n * COMPSIZE;

#ifndef LOWER
  SCAL_K(m_to, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y, 1, NULL, 0, NULL, 0);
#else
  SCAL_K(args -> m - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);
#endif

#else

  SCAL_K(m_to - m_from, 0, 0, ZERO,
#ifdef COMPLEX
	 ZERO,
#endif
	 y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0);

#endif

  for (is = m_from; is < m_to; is += DTB_ENTRIES){

    min_i = MIN(m_to - is, DTB_ENTRIES);

#ifndef LOWER
    if (is > 0){
      MYGEMV(is, min_i, 0,
	     ONE,
#ifdef COMPLEX
	     ZERO,
#endif
	     a + is * lda * COMPSIZE, lda,
#ifndef TRANS
	     x + is * COMPSIZE, 1,
	     y,                 1,
#else
	     x,                 1,
	     y + is * COMPSIZE, 1,
#endif
	     buffer);
    }
#endif

    for (i = is; i < is + min_i; i++) {

#ifndef LOWER
      if (i - is > 0) {
#ifndef TRANS
	MYAXPY(i - is, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0);
#else

	result = MYDOT(i - is,  a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif

#ifndef COMPLEX
#ifdef UNIT
      *(y + i * COMPSIZE) += *(x + i * COMPSIZE);
#else
      *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE);
#endif
#else
#ifdef UNIT
      *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0);
      *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1);
#else
      ar = *(a + (i + i * lda) * COMPSIZE + 0);
      ai = *(a + (i + i * lda) * COMPSIZE + 1);
      xr = *(x +  i            * COMPSIZE + 0);
      xi = *(x +  i            * COMPSIZE + 1);

#if (TRANSA == 1) || (TRANSA == 2)
      *(y + i * COMPSIZE + 0) += ar * xr - ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi + ai * xr;
#else
      *(y + i * COMPSIZE + 0) += ar * xr + ai * xi;
      *(y + i * COMPSIZE + 1) += ar * xi - ai * xr;
#endif
#endif
#endif

#ifdef LOWER
      if (is + min_i > i + 1) {
#ifndef TRANS
	MYAXPY(is + min_i - i - 1, 0, 0,
		*(x + i * COMPSIZE + 0),
#ifdef COMPLEX
		*(x + i * COMPSIZE + 1),
#endif
		a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0);
#else

	result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1);

#ifndef COMPLEX
	*(y + i * COMPSIZE + 0) += result;
#else
	*(y + i * COMPSIZE + 0) += CREAL(result);
	*(y + i * COMPSIZE + 1) += CIMAG(result);
#endif

#endif
      }
#endif
    }

#ifdef LOWER
    if (args -> m >  is + min_i){
      MYGEMV(args -> m - is - min_i, min_i, 0,
	     ONE,
#ifdef COMPLEX
	     ZERO,
#endif
	     a + (is + min_i + is * lda) * COMPSIZE, lda,
#ifndef TRANS
	     x +  is          * COMPSIZE, 1,
	     y + (is + min_i) * COMPSIZE, 1,
#else
	     x + (is + min_i) * COMPSIZE, 1,
	     y +  is          * COMPSIZE, 1,
#endif
	     buffer);
    }
#endif
  }

  return 0;
}