示例#1
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{

    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magmaFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    magmaFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    magmaFloatComplex *TAU       = data -> TAU;
    magmaFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    magmaFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

#ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
#endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_setlapack_numthreads(1);

#ifdef MAGMA_SETAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    affinity_set original_set;
    affinity_set new_set(my_core_id);
    int check  = 0;
    int check2 = 0;
    // bind threads
    check = original_set.get_affinity();
    if (check == 0) {
        check2 = new_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (single cpu)\n");
    }
    else
    {
        printf("Error in sched_getaffinity\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if(my_core_id==0)
    {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
#ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
#endif

        magma_csetmatrix(n, n_gpu, E, lde, dE, ldde);
        magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
        magma_device_sync();

#ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f \n" ,timeQgpu);
#endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
#ifdef ENABLE_TIMER
        if(my_core_id == 1)
            timeQcpu = magma_wtime();
#endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_ctile_bulge_applyQ(my_core_id, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

#ifdef ENABLE_TIMER
        if(my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f \n" ,timeQcpu);
        }
#endif

    } // END if my_core_id

#ifdef MAGMA_SETAFFINITY
    // unbind threads
    if (check == 0) {
        check2 = original_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (restore cpu list)\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
示例#2
0
/**
 * Applies V2 from the left to the eigenvectors Z, dZ = (I - V2*T2*V2')*Z,
 * optionally sharing the columns of Z between the GPU and CPU threads.
 *
 * The number of columns handled on the GPU is
 *     n_gpu = (magma_int_t)(f*ne),  f = 1 / (1 + (threads-1)/gpu_cpu_perf),
 * where gpu_cpu_perf is the value returned by magma_get_cbulge_gcperf().
 * With threads <= 1 all ne columns go to the GPU.
 *
 * Hybrid path (n_gpu < ne): threads-1 pthreads are spawned and the calling
 * thread runs as id 0; all of them execute magma_capplyQ_parallel_section.
 * Afterwards columns n_gpu..ne-1 of Z are copied into the matching columns
 * of dZ with magma_csetmatrix.
 *
 * GPU-only path: Z is copied to dZ and magma_cbulge_applyQ_v2 is called on dZ.
 *
 * @param threads  total number of threads (the caller plus threads-1 spawned)
 * @param uplo     not referenced by this routine
 * @param n        number of rows of Z / dZ
 * @param nb       forwarded unchanged to the apply-Q kernels
 * @param ne       number of columns of Z / dZ to process
 * @param Vblksiz  forwarded unchanged to the apply-Q kernels
 * @param Z, ldz   source matrix and its leading dimension
 * @param dZ, lddz destination matrix of magma_csetmatrix and its leading dimension
 * @param V, ldv, TAU, T, ldt  forwarded unchanged to the apply-Q kernels
 * @param info     if non-NULL: set to 0 on entry; on the GPU-only path it is
 *                 then passed to magma_cbulge_applyQ_v2; on a host allocation
 *                 failure it receives the magma_malloc_cpu error code
 * @return MAGMA_SUCCESS, or the error code returned by magma_malloc_cpu.
 */
extern "C" magma_int_t
magma_cbulge_back(magma_int_t threads, char uplo,
                  magma_int_t n, magma_int_t nb,
                  magma_int_t ne, magma_int_t Vblksiz,
                  magmaFloatComplex *Z, magma_int_t ldz,
                  magmaFloatComplex *dZ, magma_int_t lddz,
                  magmaFloatComplex *V, magma_int_t ldv,
                  magmaFloatComplex *TAU,
                  magmaFloatComplex *T, magma_int_t ldt,
                  magma_int_t* info)
{
    magma_int_t status = MAGMA_SUCCESS;

    // The hybrid path never wrote *info; give it a defined value up front.
    if (info != NULL)
        *info = 0;

    magma_setlapack_numthreads(1);

    float f = 1.;
    magma_int_t n_gpu = ne;

    // GPU-over-CPU performance ratio used to balance the column split.
    magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf();
    if (threads > 1) {
        f = 1. / (1. + (float)(threads-1)/ ((float)gpu_cpu_perf)    );
        n_gpu = (magma_int_t)(f*ne);
    }

    /****************************************************
     *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
     * **************************************************/
    if (n_gpu < ne)
    {
        /*============================
         *  use GPU+CPU's
         *==========================*/
        // GPU uses Q(1:N_GPU) and the CPUs use Q(N_GPU+1:NE).
#ifdef ENABLE_DEBUG
        // magma_int_t may be wider than int, so print through long long.
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %lld     N_GPU %lld   N_CPU %lld\n",
               (long long) ne, (long long) n_gpu, (long long) (ne-n_gpu));
#endif
        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_capplyQ_id_data* arg = NULL;
        pthread_t* thread_id = NULL;

        status = magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data));
        if (status == MAGMA_SUCCESS) {
            status = magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
            if (status != MAGMA_SUCCESS) {
                magma_free_cpu(arg);
                arg = NULL;
            }
        }

        if (status == MAGMA_SUCCESS) {
            // ===============================
            // relaunch thread to apply Q
            // ===============================
            // Set one thread per core
            pthread_attr_t thread_attr;
            pthread_attr_init(&thread_attr);
            pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
            pthread_setconcurrency(threads);

            // Launch threads 1..threads-1; id 0 runs in the calling thread.
            // NOTE(review): pthread_create failures are still not handled here;
            // the workers synchronise on a barrier inside data_applyQ, so a
            // partial launch cannot simply be skipped - confirm desired policy.
            for (magma_int_t thread = 1; thread < threads; thread++)
            {
                arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
                pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
            }
            arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
            magma_capplyQ_parallel_section(&arg[0]);

            // Wait for completion
            for (magma_int_t thread = 1; thread < threads; thread++)
            {
                void *exitcodep;
                pthread_join(thread_id[thread], &exitcodep);
            }

            // The attribute object is only needed while creating threads.
            pthread_attr_destroy(&thread_attr);

            magma_free_cpu(thread_id);
            magma_free_cpu(arg);

            // Copy the CPU-computed columns into dZ. The destination column
            // offset must use dZ's own leading dimension (lddz), not ldz.
            magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*lddz, lddz);
        }
    } else {
        /*============================
         *  use only GPU
         *==========================*/
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    // Restore the LAPACK thread count on every path, including failures.
    magma_setlapack_numthreads(threads);

    if (status != MAGMA_SUCCESS && info != NULL)
        *info = status;
    return status;
}
示例#3
0
/**
 * Applies V2 from the left to the eigenvectors Z, dZ = (I - V2*T2*V2')*Z,
 * optionally sharing the columns of Z between the GPU and CPU threads.
 *
 * The GPU share is n_gpu = (magma_int_t)(f*ne)/64*64, where the fraction f is
 * picked from a fixed table indexed by the thread count (and, for two of the
 * ranges, by the PRECISION_s / PRECISION_d macros). With threads <= 1 all ne
 * columns go to the GPU.
 *
 * Hybrid path (n_gpu < ne): threads-1 pthreads are spawned and the calling
 * thread runs as id 0; all of them execute magma_capplyQ_parallel_section.
 * Afterwards columns n_gpu..ne-1 of Z are copied into the matching columns
 * of dZ with magma_csetmatrix.
 *
 * GPU-only path: Z is copied to dZ and magma_cbulge_applyQ_v2 is called on dZ.
 *
 * uplo is not referenced. info, if non-NULL, is set to 0 on entry and is
 * passed to magma_cbulge_applyQ_v2 on the GPU-only path.
 *
 * @return MAGMA_SUCCESS.
 */
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz,
                                         cuFloatComplex *Z, magma_int_t ldz, cuFloatComplex *dZ, magma_int_t lddz,
                                         cuFloatComplex *V, magma_int_t ldv, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t ldt, magma_int_t* info)
{
    // The hybrid path never wrote *info; give it a defined value up front.
    if (info != NULL)
        *info = 0;

    // Run the BLAS/LAPACK library single-threaded while our own threads work.
#if defined(USEMKL)
    mkl_set_num_threads(1);
#endif
#if defined(USEACML)
    omp_set_num_threads(1);
#endif

    // Fraction of the ne columns given to the GPU, by thread count.
    float f = 1.;
    if (threads > 40) {
        f = 0.5;
    }
    else if (threads > 10) {
#if (defined(PRECISION_s) || defined(PRECISION_d))
        f = 0.68;
#else
        f = 0.72;
#endif
    }
    else if (threads > 5) {
#if (defined(PRECISION_s) || defined(PRECISION_d))
        f = 0.82;
#else
        f = 0.86;
#endif
    }
    else if (threads > 1) {
        f = 0.96;
    }

    // GPU column count, rounded down to a multiple of 64 when work is shared.
    magma_int_t n_gpu = ne;
    if (threads > 1)
        n_gpu = (magma_int_t)(f*ne)/64*64;

    /****************************************************
     *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
     * **************************************************/
    if (n_gpu < ne)
    {
        /*============================
         *  use GPU+CPU's
         *==========================*/
        // GPU uses Q(1:N_GPU) and the CPUs use Q(N_GPU+1:NE).
        // magma_int_t may be wider than int, so print through long long.
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %lld     N_GPU %lld   N_CPU %lld\n",
               (long long) ne, (long long) n_gpu, (long long) (ne-n_gpu));

        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_capplyQ_id_data* arg = new magma_capplyQ_id_data[threads];
        pthread_t* thread_id = new pthread_t[threads];

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // Set one thread per core
        pthread_attr_t thread_attr;
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch threads 1..threads-1; id 0 runs in the calling thread.
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
        }
        arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
        magma_capplyQ_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        // The attribute object is only needed while creating threads.
        pthread_attr_destroy(&thread_attr);

        delete[] thread_id;
        delete[] arg;

        // Copy the CPU-computed columns into dZ. The destination column
        // offset must use dZ's own leading dimension (lddz), not ldz.
        magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*lddz, lddz);
    }
    else {
        /*============================
         *  use only GPU
         *==========================*/
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    // Hand the requested thread count back to the BLAS/LAPACK library.
#if defined(USEMKL)
    mkl_set_num_threads(threads);
#endif
#if defined(USEACML)
    omp_set_num_threads(threads);
#endif

    return MAGMA_SUCCESS;
}
示例#4
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{

    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;
    
    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    cuFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    cuFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    cuFloatComplex *TAU       = data -> TAU;
    cuFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    cuFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);
 
    magma_int_t info;
    
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    
    magma_int_t n_cpu = ne - n_gpu;

#if defined(SETAFFINITY)    
    cpu_set_t set;
    CPU_ZERO( &set );
    CPU_SET( my_core_id, &set );
    sched_setaffinity( 0, sizeof(set), &set) ;
#endif
   
            if(my_core_id==0)
            {
                //=============================================
                //   on GPU on thread 0:
                //    - apply V2*Z(:,1:N_GPU)
                //=============================================
                timeQgpu = magma_wtime();
                
                magma_csetmatrix(n, n_gpu, E, lde, dE, ldde);
                magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
                
                magma_device_sync();
                timeQgpu = magma_wtime()-timeQgpu;
                printf("  Finish Q2_GPU GGG timing= %f \n" ,timeQgpu);

            }else{
                //=============================================
                //   on CPU on threads 1:allcores_num-1:
                //    - apply V2*Z(:,N_GPU+1:NE)
                //=============================================
                if(my_core_id == 1)
                    timeQcpu = magma_wtime();
                
                magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
                cuFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
                n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));
                
                magma_ctile_bulge_applyQ('L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
                pthread_barrier_wait(barrier);
                if(my_core_id == 1){
                    timeQcpu = magma_wtime()-timeQcpu;
                    printf("  Finish Q2_CPU CCC timing= %f \n" ,timeQcpu);
                }
                
            } // END if my_core_id
        
    
#if defined(SETAFFINITY)    
    // unbind threads 
    magma_int_t sys_corenbr = 1;
    sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
    CPU_ZERO( &set );
    for(magma_int_t i=0; i<sys_corenbr; i++)
        CPU_SET( i, &set );
    sched_setaffinity( 0, sizeof(set), &set) ;
#endif
    
    return 0;
}
示例#5
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{
    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magmaFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    magmaFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    magmaFloatComplex *TAU       = data -> TAU;
    magmaFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    magmaFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

    #ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    #endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_set_lapack_numthreads(1);

#ifndef MAGMA_NOAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    cpu_set_t old_set, new_set;

    //store current affinity
    CPU_ZERO(&old_set);
    sched_getaffinity( 0, sizeof(old_set), &old_set);
    //set new affinity
    // bind threads
    CPU_ZERO(&new_set);
    CPU_SET(my_core_id, &new_set);
    sched_setaffinity( 0, sizeof(new_set), &new_set);
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if (my_core_id == 0) {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
        #endif
        magma_queue_t queue;
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queue );

        magma_csetmatrix( n, n_gpu, E, lde, dE, ldde, queue );
        magma_cbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);

        magma_queue_destroy( queue );
        
        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f\n", timeQgpu);
        #endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
        #ifdef ENABLE_TIMER
        if (my_core_id == 1)
            timeQcpu = magma_wtime();
        #endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_ctile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

        #ifdef ENABLE_TIMER
        if (my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f\n", timeQcpu);
        }
        #endif
    } // END if my_core_id

#ifndef MAGMA_NOAFFINITY
    //restore old affinity
    sched_setaffinity(0, sizeof(old_set), &old_set);
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}