Ejemplo n.º 1
0
Archivo: gc.c Proyecto: aosm/graphviz
/*
 * Print one summary row for a graph.
 *
 * Does nothing when `silent` is set.  Otherwise writes n_indent copies of
 * a two-space indent to `outfile`, calls ipr() for each of the four counts
 * whose bit is set in `flags` (NODES, EDGES, CC, CL, in that order), and
 * finishes the row with the graph name, followed by the file name in
 * parentheses when `fname` is non-NULL.
 *
 * NOTE(review): the indent goes to `outfile` while the trailing name is
 * written with printf (stdout); ipr() is defined elsewhere, so confirm
 * which stream the whole row is meant to use.
 */
static void
wcp (int nnodes, int nedges, int ncc, int ncl, char* gname, char* fname)
{
  int depth = 0;

  if (!silent) {
    /* indentation, one unit per nesting level */
    while (depth < n_indent) {
      fputs ("  ", outfile);
      depth++;
    }

    /* selected counters, always in the same column order */
    if (flags & NODES)
      ipr (nnodes);
    if (flags & EDGES)
      ipr (nedges);
    if (flags & CC)
      ipr (ncc);
    if (flags & CL)
      ipr (ncl);

    /* graph name, optionally tagged with its source file */
    if (!fname)
      printf(" %s\n", gname);
    else
      printf(" %s (%s)\n", gname, fname);
  }
}
Ejemplo n.º 2
0
// Append one IP_RESULT to ip_results for every entry of `results`
// whose estimated remaining runtime is non-zero, then let
// init_ip_results() compute an initial schedule for the list.
//
// Each IP_RESULT is built from the result's name, the time left until
// its report deadline (report_deadline - now) and the remaining runtime.
//
void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
    unsigned int idx = 0;
    while (idx < results.size()) {
        RESULT* res = results[idx];
        idx++;

        double remaining = res->estimated_runtime_remaining();
        if (remaining != 0) {
            IP_RESULT entry(res->name, res->report_deadline-now, remaining);
            ip_results.push_back(entry);
        }
    }

    // The first argument was once work_buf_min(); it is now fixed at 0.
    init_ip_results(0, ncpus, ip_results);
}
Ejemplo n.º 3
0
/***********************************************
 * Fill plqcd_g.npr with the indices of the nearest
 * neighbour processes in the negative and positive
 * sense of each of the 4 directions.
 *
 * npr[2*mu]   is the index of the neighbour process
 *             in the -mu direction.
 *
 * npr[2*mu+1] is the index of the neighbour process
 *             in the +mu direction.
 *
 * Neighbour coordinates are obtained from this
 * process's coordinates plqcd_g.cpr by stepping one
 * unit along mu with periodic wrap-around over
 * plqcd_g.nprocs[mu], and are mapped to an index
 * by ipr().
 ***********************************************/
void set_npr(void)
{
   int dir;
   int coord[4];

   /* start from this process's own grid coordinates */
   for(dir=0; dir<4; dir++)
      coord[dir] = plqcd_g.cpr[dir];

   for(dir=0; dir<4; dir++)
   {
      /* one step backwards; coordinate 0 wraps to nprocs-1 */
      if (coord[dir] == 0)
         coord[dir] = plqcd_g.nprocs[dir] - 1;
      else
         coord[dir] = coord[dir] - 1;
      plqcd_g.npr[2*dir] = ipr(coord);

      /* one step forwards; the last coordinate wraps to 0 */
      coord[dir] = (plqcd_g.cpr[dir] + 1) % plqcd_g.nprocs[dir];
      plqcd_g.npr[2*dir+1] = ipr(coord);

      /* restore before moving on to the next direction */
      coord[dir] = plqcd_g.cpr[dir];
   }
}
Ejemplo n.º 4
0
/*
 * Locate the lattice site with global coordinates xg[0..3].
 *
 *   xg  - global site coordinates; each xg[mu] must satisfy
 *         0 <= xg[mu] < plqcd_g.nprocs[mu]*plqcd_g.latdims[mu]
 *   pid - receives ipr() of the process-grid coordinates that own the site
 *   xeo - receives plqcd_g.ipt_eo[] at the site's local lexicographic
 *         index (as computed by cart2lex())
 *
 * Prints an error and calls exit(1) if any coordinate is out of range.
 */
void find_site(int xg[], int *pid, int *xeo)
{
    int local[4];   /* coordinates of the site inside the owning process */
    int owner[4];   /* process-grid coordinates of the owning process    */
    int dir;

    /* reject coordinates outside the global lattice */
    for(dir=0; dir<4; dir++)
    {
        if( !( (xg[dir] >= 0) && (xg[dir] < plqcd_g.nprocs[dir]*plqcd_g.latdims[dir]) ) )
        {
            printf("Error, global site coordinates are out of bounds in the %d -th direction\n",dir);
            exit(1);
        }
    }

    /* split every global coordinate into (owning process, local offset) */
    for(dir=0; dir<4; dir++)
    {
        owner[dir] = xg[dir]/plqcd_g.latdims[dir];
        local[dir] = xg[dir] - owner[dir]*plqcd_g.latdims[dir];
    }

    {
        const int eo_index = plqcd_g.ipt_eo[cart2lex(local)];
        const int rank     = ipr(owner);

        *pid = rank;
        *xeo = eo_index;
    }
}
Ejemplo n.º 5
0
int main(int argc, char **argv)
{
    //initialize plqcd
    int init_status;

    if(argc < 3) {
        fprintf(stderr,"Error. Must pass the name of the input file and the number of multiplications to be performed \n");
        fprintf(stderr,"Usage: %s input_file_name Nmul\n",argv[0]);
        exit(1);
    }

    init_status = init_plqcd(argc,argv);

    if(init_status != 0)
        printf("Error initializing plqcd\n");

    int proc_id;
    int i,j,k,Nmul;
    proc_id = ipr(plqcd_g.cpr);

    Nmul=atoi(argv[2]);

#if 0
    //Intialize the ranlux random number generator
    start_ranlux(0,1);
#endif

    int NPROCS=plqcd_g.nprocs[0]*plqcd_g.nprocs[1]*plqcd_g.nprocs[2]*plqcd_g.nprocs[3];

    char ofname[128];

    char buff[128];

    strcpy(ofname,"test_hopping_output.procgrid.");

    sprintf(buff,"%d-%d-%d-%d.nthreads.%d.proc.%d",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3],plqcd_g.nthread,proc_id);



    strcat(ofname,buff);


    FILE *ofp;

    //FILE *ofp_source;

    //if(proc_id==0)
    //{
    //     ofp_source = fopen("test_rand_vals.out","w");
    //}

    if(proc_id==0)
    {
        ofp=fopen(ofname,"w");
        fprintf(ofp,"INPUT GLOBALS:\n");
        fprintf(ofp,"----------------\n");
        fprintf(ofp,"NPROC0 %d, NPROC1 %d, NPROC2 %d, NPROC3 %d, NTHREAD %d\n",plqcd_g.nprocs[0],plqcd_g.nprocs[1],plqcd_g.nprocs[2],plqcd_g.nprocs[3], plqcd_g.nthread);
        fprintf(ofp,"L0 %d, L1 %d, L2 %d, L3 %d\n\n",plqcd_g.latdims[0],plqcd_g.latdims[1],plqcd_g.latdims[2],plqcd_g.latdims[3]);
        //printf("sizeof(spinor) %ld, sizeof(halfspinor) %ld, sizeof(su3) %ld \n",sizeof(spinor),sizeof(halfspinor),sizeof(su3));
    }


    int nthr;
#ifdef _OPENMP
    #pragma omp parallel
    {
        nthr=omp_get_num_threads();
        if(omp_get_thread_num() == 0)
            if(proc_id==0)
                fprintf(ofp,"Number of threads as returned by openmp %d\n",nthr);
    }
#endif


    /*****************************************************
     *Testing the Dirac operator interface
     ****************************************************/




    spinor *pin= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
    if(pin==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin.\n");
        exit(2);
    }

    spinor *pout= (spinor *) amalloc(plqcd_g.VOLUME*sizeof(spinor), plqcd_g.ALIGN);
    if(pout==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout.\n");
        exit(2);
    }

    su3 *ufield= (su3 *) amalloc(4*plqcd_g.VOLUME*sizeof(su3), plqcd_g.ALIGN);
    if(ufield==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield.\n");
        exit(2);
    }


    //256 arrays
#ifdef AVX
    spinor_256 *pin_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
    if(pin_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin_256.\n");
        exit(2);
    }


    spinor_256 *pout_256= (spinor_256 *) amalloc(plqcd_g.VOLUME/2*sizeof(spinor_256), plqcd_g.ALIGN);
    if(pout_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout_256.\n");
        exit(2);
    }


    su3_256 *ufield_256= (su3_256 *) amalloc(4*plqcd_g.VOLUME/2*sizeof(su3_256), plqcd_g.ALIGN);

    if(ufield_256==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_256.\n");
        exit(2);
    }
#endif


    //512 arrays
#ifdef MIC
    spinor_512 *pin_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
    if(pin_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pin_512.\n");
        exit(2);
    }


    spinor_512 *pout_512= (spinor_512 *) amalloc(plqcd_g.VOLUME/4*sizeof(spinor_512), plqcd_g.ALIGN);
    if(pout_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for spinor pout_512.\n");
        exit(2);
    }


    su3_512 *ufield_512= (su3_512 *) amalloc(4*plqcd_g.VOLUME/4*sizeof(su3_512), plqcd_g.ALIGN);

    if(ufield_512==NULL)
    {
        fprintf(stderr,"ERROR: insufficient memory for gauge field ufield_512.\n");
        exit(2);
    }
#endif





    //intialize the random number generator by a seed equals to the process rank
    srand((unsigned int) proc_id);


    //Initialize the input spinor and gauge links to random numbers



    //intialize the random number generator by a seed equals to the process rank
    srand((unsigned int) proc_id);


    //Initialize the input spinor and gauge links to random numbers
    double ru[18];
    double rs[24];

    for(i=0; i<plqcd_g.VOLUME; i++)
    {
        //ranlxd(rs,24);
        for(j=0; j<24; j++)
        {
            rs[j]= rand() / (double)RAND_MAX;
            //fprintf(stderr,"rs[%d]=%lf\n",j,rs[j]);
        }

        pin[i].s0.c0=rs[0]+I*rs[1];
        pin[i].s0.c1=rs[2]+I*rs[3];
        pin[i].s0.c2=rs[4]+I*rs[5];
        pin[i].s1.c0=rs[6]+I*rs[7];
        pin[i].s1.c1=rs[8]+I*rs[9];
        pin[i].s1.c2=rs[10]+I*rs[11];
        pin[i].s2.c0=rs[12]+I*rs[13];
        pin[i].s2.c1=rs[14]+I*rs[15];
        pin[i].s2.c2=rs[16]+I*rs[17];
        pin[i].s3.c0=rs[18]+I*rs[19];
        pin[i].s3.c1=rs[20]+I*rs[21];
        pin[i].s3.c2=rs[22]+I*rs[23];


        //ranlxd(rs,24);
        for(j=0; j<24; j++)
            rs[j]= rand() / (double)RAND_MAX;

        pout[i].s0.c0=rs[0]+I*rs[1];
        pout[i].s0.c1=rs[2]+I*rs[3];
        pout[i].s0.c2=rs[4]+I*rs[5];
        pout[i].s1.c0=rs[6]+I*rs[7];
        pout[i].s1.c1=rs[8]+I*rs[9];
        pout[i].s1.c2=rs[10]+I*rs[11];
        pout[i].s2.c0=rs[12]+I*rs[13];
        pout[i].s2.c1=rs[14]+I*rs[15];
        pout[i].s2.c2=rs[16]+I*rs[17];
        pout[i].s3.c0=rs[18]+I*rs[19];
        pout[i].s3.c1=rs[20]+I*rs[21];
        pout[i].s3.c2=rs[22]+I*rs[23];

        for(j=0; j<4; j++)
        {
            //ranlxd(ru,18);
            for(k=0; k<18; k++)
            {
                ru[k]= rand() / (double)RAND_MAX;
                //fprintf(stderr,"ru[%d]=%lf\n",k,ru[k]);
            }


            ufield[4*i+j].c00=ru[0]+I*ru[1];
            ufield[4*i+j].c01=ru[2]+I*ru[3];
            ufield[4*i+j].c02=ru[4]+I*ru[5];
            ufield[4*i+j].c10=ru[6]+I*ru[7];
            ufield[4*i+j].c11=ru[8]+I*ru[9];
            ufield[4*i+j].c12=ru[10]+I*ru[11];
            ufield[4*i+j].c20=ru[12]+I*ru[13];
            ufield[4*i+j].c21=ru[14]+I*ru[15];
            ufield[4*i+j].c22=ru[16]+I*ru[17];
        }

    }

#ifdef AVX
    for(i=0; i<plqcd_g.VOLUME; i +=2)
    {
        for(j=0; j<4; j++)
            copy_su3_to_su3_256(ufield_256+4*i/2+j, ufield+4*i+j, ufield+4*(i+1)+j);

        copy_spinor_to_spinor_256(pin_256+i/2, pin+i, pin+i+1);
        copy_spinor_to_spinor_256(pout_256+i/2, pout+i, pout+i+1);
    }
#endif

#ifdef MIC
    for(i=0; i<plqcd_g.VOLUME; i +=4)
    {
        for(j=0; j<4; j++)
            copy_su3_to_su3_512(ufield_512+4*i/4+j, ufield+4*i+j, ufield+4*(i+1)+j, ufield+4*(i+2)+j, ufield+4*(i+3)+j);

        copy_spinor_to_spinor_512(pin_512+i/4, pin+i, pin+i+1, pin+i+2, pin+i+3);
        copy_spinor_to_spinor_512(pout_512+i/4, pout+i, pout+i+1, pout+i+2, pout+i+3);
    }
#endif


    double total,t1=0.0,t2=0.0,mytotal;
    int  matvecs;


#ifdef ASSYMBLY
    //---------------------------------------------
    //1: non-blocking assymbly/c version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_assymbly(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_assymbly(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"non-blocking assymbly/c version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef SSE3_INTRIN
    //---------------------------------------------
    //1: non-blocking sse3 with intrinsics version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_intrin(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_intrin(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"non-blocking sse3 with intrinsics version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }



    //---------------------------------------------
    //2: blocking sse3 with intrinsics version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_sse3_intrin_blocking(pin,pout,ufield);
            t2=plqcd_hopping_matrix_oe_sse3_intrin_blocking(pin,pout,ufield);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"blocking sse3 with intrinsics version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef AVX
    //---------------------------------------------
    //2: avx version
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            t1=plqcd_hopping_matrix_eo_intrin_256(pin_256,pout_256,ufield_256);
            t2=plqcd_hopping_matrix_oe_intrin_256(pin_256,pout_256,ufield_256);
            mytotal += t1+t2;
        }
        matvecs += Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"avxversion:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }
#endif


#ifdef MIC

#ifdef TEST_HOPPING_MIC
    //---------------------------------------------
    //3: MIC version full su3 matrix
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
            //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
            t1=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
            t2=plqcd_hopping_matrix_eo_single_mic(pin_512,pout_512,ufield_512);
            mytotal += t1+t2;
        }
        matvecs += 2*Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"mic version, 3x3 links:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }


    //---------------------------------------------
    //3: MIC version full reduced su3 storage
    //---------------------------------------------
    matvecs=0;
    total=0.0;
    mytotal =0.0;

    t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);

    while(mytotal < 30)
    {
        MPI_Barrier(MPI_COMM_WORLD);
        for(i=0; i<Nmul; i++)
        {
            //t1=plqcd_hopping_matrix_eo_intrin_512(pin_512,pout_512,ufield_512);
            //t2=plqcd_hopping_matrix_oe_intrin_512(pin_512,pout_512,ufield_512);
            t1=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
            t2=plqcd_hopping_matrix_eo_single_mic_short(pin_512,pout_512,ufield_512);
            mytotal += t1+t2;
        }
        matvecs += 2*Nmul;
    }

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);


    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"mic version, 2x3 links:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,(double )matvecs*plqcd_g.VOLUME/2.0*1200/total/1e+6);
    }

#endif

#ifdef TEST_SU3MUL_MIC

    matvecs=0;
    total=0.0;
    mytotal =0.0;

    //while(mytotal < 10)
    //{
    MPI_Barrier(MPI_COMM_WORLD);
    for(i=0; i<Nmul; i++)
    {
        t1=stop_watch(0.0);

#ifdef _OPENMP
        #pragma omp parallel
        {
#endif
            __m512d U[3][3], gin[3],gout[3];
            su3_512 *u0;
            su3_vector_512 *hin,*hout;
#ifdef _OPENMP
            #pragma omp for
#endif
            for(j=0; j< plqcd_g.VOLUME/4; j++)
            {
                u0  = &ufield_512[4*j];
                hin = &pin_512[j].s0;
                hout= &pout_512[j].s0;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);

                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);
                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);
                u0++;
                hin++;
                hout++;

                intrin_su3_load_512(U,u0);
                intrin_vector_load_512(gin,hin);
                intrin_su3_multiply_512(gout,U,gin);
                intrin_vector_store_512(hout,gout);

            }
#ifdef _OPENMP
        }
#endif

        t2 = stop_watch(t1);
        mytotal += t2;
    }
    matvecs += 4*Nmul*plqcd_g.VOLUME;
    //}

    MPI_Reduce(&mytotal,&total,1,MPI_DOUBLE,MPI_SUM,0, MPI_COMM_WORLD);
    MPI_Bcast(&total,1,MPI_DOUBLE,0, MPI_COMM_WORLD);

    if (proc_id==0)
    {
        total /= (double)(NPROCS);
    }


    if(proc_id==0)
    {
        fprintf(ofp,"su3mul mic version:\n");
        fprintf(ofp,"------------------------------------------\n");
        fprintf(ofp,"test_hopping\tmult\t%d\ttotal(sec)\t%lf\tMFlops/process\t%lf\n",
                matvecs,total,matvecs*66.0/total/1e+6);
    }
#endif

#endif //MIC

    finalize_plqcd();

    return 0;
}