Exemple #1
0
// One counting-sort pass: writes the n entries of `a` into `b`, ordered by
// the key r[a[i]].
//
//   r : key table, indexed by the values stored in `a`
//   a : input sequence of n indices into `r`
//   b : output sequence (n entries)
//   n : number of entries in `a` / `b`
//   m : size of the counter range; keys are used as indices into `ww`, so
//       they must lie in [0, m)
//
// `wv` (per-element key cache) and `ww` (per-key counters) are scratch arrays
// declared outside this function.
// NOTE(review): `rep` and `DEP` are loop macros defined elsewhere. Presumably
// rep(i, k) iterates i = 0 .. k-1 and DEP(i, hi, lo) iterates i = hi down to
// lo -- confirm. If so, the final backward fill makes the pass stable.
void rsort(int *r, int *a, int *b, int n, int m) {
    // Cache the key of every input element.
    rep(i, n)
    wv[i] = r[a[i]];
    // Clear the counters.
    rep(i, m)
    ww[i] = 0;
    // Histogram: count how many elements carry each key.
    rep(i, n)
    ww[wv[i]]++;
    // Running sum over the counters, so ww[k] becomes the number of elements
    // with key <= k (assuming rep ascends from 0 -- see note above).
    rep(i, m - 1)
    ww[i + 1] += ww[i];
    // Place each element into the slot given by its (pre-decremented) counter.
    DEP(i, n - 1, 0)
    b[--ww[wv[i]]] = a[i];
}
Exemple #2
0
/**
 * Invalidates the cached copies of `d`.
 *
 * The cache entry for `d` holds one boolean flag per node index. After this
 * call only the flag of the owner node (`d->rank`) is set; when
 * `exceptOnNode` is not -1, the flag of that node keeps the value it had
 * before the call.
 *
 * If the local node (`rank`) currently has its flag set and is neither the
 * owner nor the excepted node, a high-priority DeallocateDataTask with a
 * WRITE dependency on `d` is queued first.
 *
 * NOTE(review): the result of find(d) is dereferenced without any check, so
 * `d` is assumed to be registered in the cache already -- confirm.
 *
 * @param d            data whose cached copies are invalidated
 * @param exceptOnNode node index whose flag must be preserved, or -1 for none
 */
void MpiDataCache::invalidateData(Data* d, int exceptOnNode) {
  auto entry = find(d);
  auto& validOnNode = entry->second;

  const bool hasException = (exceptOnNode != -1);
  const bool localCopyIsStale =
      validOnNode[rank] && (rank != d->rank) && (rank != exceptOnNode);

  if (localCopyIsStale) {
    TaskScheduler::getInstance().insertTask(new DeallocateDataTask(d),
                                            DEPS(1, DEP(d, WRITE)),
                                            Priority::HIGH);
  }

  // Remember the excepted node's flag so it survives the reset below.
  bool preservedFlag = false;
  if (hasException) {
    preservedFlag = validOnNode[exceptOnNode];
  }

  // Reset every flag, then mark the owner node as the only valid holder.
  validOnNode.assign(size, false);
  validOnNode[d->rank] = true;

  if (hasException) {
    validOnNode[exceptOnNode] = preservedFlag;
  }
}
Exemple #3
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 *
 *  Submits the bulge-chasing kernels (QUARK_CORE_dtrdalg) to the QUARK
 *  runtime, waits for them, then copies the diagonal of A into D and the
 *  first off-diagonal of A into E.
 *
 *  @param uplo      PlasmaLower: the off-diagonal is read from A(i+1,i);
 *                   any other value: it is read from A(i,i+1).
 *  @param A         Matrix descriptor. A.m is used as the order N and A.mb
 *                   as the bandwidth parameter NB.
 *  @param D         Output, N entries: diagonal of the result.
 *  @param E         Output, N-1 entries: off-diagonal of the result.
 *  @param T         Not referenced in this function.
 *  @param sequence  If sequence->status is not PLASMA_SUCCESS on entry the
 *                   function returns without doing anything; otherwise its
 *                   quark_sequence is attached to the submitted tasks.
 *  @param request   Only used when flushing the sequence after a LAPACK
 *                   failure.
 *
 *  NOTE(review): A(m,n) and DEP(m) are macros, and tblg / Wtimming() are
 *  symbols, all defined outside this function. DEP is also the name of a
 *  local int array; DEP(x) presumably yields the address of DEP[x], used as
 *  a dependency handle -- confirm against the macro definition.
 **/
void plasma_pdsbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static double zone  = (double) 1.0;
    static double             dzero = (double) 0.0;
    double ztmp;
    double absztmp;
#endif

    double *C, *S;
    int N, NB, INgrsiz, INthgrsiz, BAND;
    int myid, grsiz, shift=3, stt, st, ed, stind, edind;
    int blklastind, colpt, PCOL, ACOL, MCOL;
    int stepercol, mylastid, grnb, grid;
    int *DEP,*MAXID;
    int i, j, m;
    int thgrsiz, thgrnb, thgrid, thed;
    /* NOTE(review): eltsize is not read anywhere in this function. */
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    /* Do nothing if an earlier task of the sequence already failed. */
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }
    
    /* NB == 0: only the diagonal is copied to D; E is left zeroed. */
    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = fabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * A barrier is used because the bulge chasing has to wait until
     * the reduction to band has finished.
     * It could be removed once the function dependencies are linked
     * into the reduction to band. Keep in mind the case NB=1, where
     * no bulge chasing is done.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    /* Start of the timed section; completed after the final barrier below.
     * The NB == 1 path returns before the timer is completed. */
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = ( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    /* Replace the off-diagonal by its absolute value and
                     * push the removed factor onto the next off-diagonal. */
                    ztmp        = *A((i+1),i);
                    absztmp     = fabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  ( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* upper off-diag value */
                    /* Same normalisation as above, on the super-diagonal. */
                    ztmp        = *A(i, (i+1));
                    absztmp     = fabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           /* i == N-1 here: copy the last diagonal entry. */
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           /* i == N-1 here: copy the last diagonal entry. */
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /*
     * Intended for the small-matrix case (N < NB), where calling LAPACK
     * directly would be better. As written the condition is N <= 0, and
     * N == 0 has already returned above, so this branch can only be
     * entered with a negative N.
     */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        double *work, *TTau;
        int info, ldwork = N*N;
        work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
        TTau = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
        
        info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    /* Workspaces: DEP and MAXID hold N+1 ints, C and S hold N doubles.
     * Only MAXID is zero-initialised here. */
    DEP   = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    MAXID = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    C     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    S     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    memset(MAXID,0,(N+1)*sizeof(int));

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /* 
     * Initialisation of local parameters. These parameters should be
     * inputs or tuned parameters.
     *
     * Group size chosen from NB (and N):
     *   NB > 160            -> 2
     *   100 < NB <= 160     -> 2 if N < 5000, otherwise 4
     *   NB <= 100           -> 6
     */
    INgrsiz = 1;
    if( NB > 160 ) {
        INgrsiz = 2;
    }
    else if( NB > 100 ) {
        if( N < 5000 )
            INgrsiz = 2;
        else
            INgrsiz = 4;
    } else {
        INgrsiz = 6;
    }
    INthgrsiz = N;
    BAND      = 0;

    grsiz   = INgrsiz;
    thgrsiz = INthgrsiz;
    /* Fallback defaults in case a size of 0 was selected. */
    if( grsiz   == 0 ) grsiz   = 6;
    if( thgrsiz == 0 ) thgrsiz = N;

    /* stepercol = shift/grsiz rounded up. */
    i = shift/grsiz;
    stepercol =  i*grsiz == shift ? i:i+1;

    /* thgrnb = (N-2)/thgrsiz rounded up. */
    i       = (N-2)/thgrsiz;
    thgrnb  = i*thgrsiz == (N-2) ? i:i+1;

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (N-2));
        for (i = stt; i <= N-2; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (j = st; j <=ed; j++){
                    /* PCOL:  dependency on the ID of the master of the group of the previous column.  (Previous Column:PCOL). */
                    /* ACOL:  dependency on the ID of the master of the previous group of my column.   (Actual  Column:ACOL). (it is 0(NULL) for myid=1) */
                    /* MCOL:  OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
                    myid     = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
                    mylastid = myid+grsiz-1;
                    PCOL     = mylastid+shift-1;  /* to know the dependent ID of the previous column. need to know the master of its group */
                    MAXID[j] = myid;
                    PCOL     = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
                    grnb     = PCOL/grsiz;
                    grid     = grnb*grsiz == PCOL ? grnb:grnb+1;
                    PCOL     = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */
                    ACOL     = myid-grsiz;
                    if(myid==1)ACOL=0;
                    MCOL     = myid;
                    
                    QUARK_CORE_dtrdalg(
                        plasma->quark, &task_flags,
                        uplo, N, NB,
                        &A, C, S, i, j, m, grsiz, BAND,
                        DEP(PCOL), DEP(ACOL), DEP(MCOL) );
                    
                    /* Work out the last index covered by this group's final
                     * kernel; once it reaches N-1 the first column of the
                     * sweep range (stt) is advanced. */
                    if(mylastid%2 ==0){
                        blklastind      = (mylastid/2)*NB+1+j-1;
                    }else{
                        colpt      = ((mylastid+1)/2)*NB + 1 +j -1 ;
                        stind      = colpt-NB+1;
                        edind      = min(colpt,N);
                        if( (stind>=edind-1) && (edind==N) )
                            blklastind=N;
                        else
                            blklastind=0;
                    }
                    if(blklastind >= (N-1))  stt=stt+1;
                } /* END for j=st:ed    */
            } /* END for m=1:stepercol */
        } /* END for i=1:MINMN-2      */
    } /* END for thgrid=1:thgrnb     */
    
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and freeing the workspace.
     * This will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    //printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) DEP);
    plasma_shared_free(plasma, (void*) MAXID);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In the complex case, the off-diagonal elements are
     * not necessarily real. We have to make the off-diagonal
     * elements real and copy them to E.
     * When using Householder elimination,
     * ZLARFG gives a real value as output, so all the
     * diagonal/off-diagonal elements except the last one are already
     * real and thus we only need to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = ( *A(i,i) );               
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = (*A(i+1, i));
          else
              E[i] = fabs( *A(i+1, i));
       }
        /* i == N-1 here: copy the last diagonal entry. */
        D[i] = ( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  ( *A(i,i) );               
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = (*A(i, (i+1)));
            else
                E[i] = fabs(*A(i, (i+1)));
        }
        /* i == N-1 here: copy the last diagonal entry. */
        D[i] = ( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        /* i == N-1 here: copy the last diagonal entry. */
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        /* i == N-1 here: copy the last diagonal entry. */
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
Exemple #4
0
// To render the generated .dot file: dot -Tps graph.dot -o graph.ps
//
// Writes the graph to `filename` in Graphviz dot syntax.
//
//   node     : node to emit when `first` is false; ignored (overwritten by
//              each root in turn) when `first` is true.
//   filename : output file. It is deleted first when `first` is true, then
//              always opened in append mode.
//   first    : true for the top-level call, false for the recursive calls
//              made on successors.
//
// The top-level call (when _length is non-zero) resets the _read markers,
// pre-marks every delayed-slot instruction as already read, writes the
// "digraph G1 {" header followed by one "iN;" line for every index below the
// smallest delayed-slot index, and finally writes the closing "}".
//
// For each node not yet marked in _read, one edge line is written per
// non-null arc, then the function recurses on the arc targets. The stream is
// closed before recursing because every recursive call opens the same file
// itself.
//
// Fix over the previous version: the stream is reopened only when it is
// actually closed. Calling open() on a stream that is still open (which
// happened whenever a root was already marked as read, or when there were no
// roots) sets failbit, which silently skipped all later roots and dropped the
// closing "}".
void Dfg::restitute(Node_dfg * node, string const filename, bool first){
  if(first)
    remove(filename.c_str());
  ofstream monflux(filename.c_str(), ios::app);
  list<Node_dfg*>::iterator it;

  if(first && _length){
    // Start from a clean "not yet emitted" state.
    for(int i=0; i<_length; i++)
      _read[i]=0;

    // Delayed-slot instructions are marked as read so no edges are emitted
    // from them; remember the smallest index among them.
    int index_min = _length;
    for(it = _delayed_slot.begin(); it != _delayed_slot.end(); ++it){
      _read[(*it)->get_instruction()->get_index()] = 1;
      if (index_min > (*it)->get_instruction()->get_index())
        index_min = (*it)->get_instruction()->get_index();
    }

    monflux<<"digraph G1 {"<<endl;
    for(int i=0; i<index_min; i++){
      monflux<<"i"<<i<< ";"<<endl;
    }
  }

  // In a recursive call (first == false) `node` stays the same for every
  // iteration, so only the first pass can do any work: afterwards the node
  // is marked in _read.
  it=_roots.begin();
  for (unsigned int j=0; j<_roots.size(); j++){

    if(first) node = *it;

    if(monflux){
      if(!_read[node->get_instruction()->get_index()]){
        _read[node->get_instruction()->get_index()]=1;

        // Emit the node's successors, if it has any.
        for(int i=0; i<node->get_nb_arcs(); i++){
          if (node->get_arc(i)){

            monflux<<"i"<<node->get_instruction()->get_index();
            monflux<<" ->  i" << node->get_arc(i)->next->get_instruction()->get_index();

            monflux<<" [label= \""<< DEP(node->get_arc(i)->dep) << node->get_arc(i)->delai<<"\"];"<<endl;
          }
        }
        // Release the file before the recursive calls reopen it.
        monflux.close();

        for(int i=0; i<node->get_nb_arcs(); i++){
          if (node->get_arc(i))
            restitute(node->get_arc(i)->next, filename, false);
        }
      }
    }
    // Reopen for the next root only if the stream was closed above.
    if((j+1) < _roots.size() && !monflux.is_open())
      monflux.open(filename.c_str(), ios::app);
    ++it;
  }

  if (first && _length){
    // The stream may still be open here (last root skipped, or no roots).
    if(!monflux.is_open())
      monflux.open(filename.c_str(), ios::app);
    monflux<<"}"<<endl;
    monflux.close();
  }
  return;

}