Beispiel #1
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{

    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;
    
    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    cuFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    cuFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    cuFloatComplex *TAU       = data -> TAU;
    cuFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    cuFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);
 
    magma_int_t info;
    
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    
    magma_int_t n_cpu = ne - n_gpu;

#if defined(SETAFFINITY)    
    cpu_set_t set;
    CPU_ZERO( &set );
    CPU_SET( my_core_id, &set );
    sched_setaffinity( 0, sizeof(set), &set) ;
#endif
   
            if(my_core_id==0)
            {
                //=============================================
                //   on GPU on thread 0:
                //    - apply V2*Z(:,1:N_GPU)
                //=============================================
                timeQgpu = magma_wtime();
                
                magma_csetmatrix(n, n_gpu, E, lde, dE, ldde);
                magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
                
                magma_device_sync();
                timeQgpu = magma_wtime()-timeQgpu;
                printf("  Finish Q2_GPU GGG timing= %f \n" ,timeQgpu);

            }else{
                //=============================================
                //   on CPU on threads 1:allcores_num-1:
                //    - apply V2*Z(:,N_GPU+1:NE)
                //=============================================
                if(my_core_id == 1)
                    timeQcpu = magma_wtime();
                
                magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
                cuFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
                n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));
                
                magma_ctile_bulge_applyQ('L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
                pthread_barrier_wait(barrier);
                if(my_core_id == 1){
                    timeQcpu = magma_wtime()-timeQcpu;
                    printf("  Finish Q2_CPU CCC timing= %f \n" ,timeQcpu);
                }
                
            } // END if my_core_id
        
    
#if defined(SETAFFINITY)    
    // unbind threads 
    magma_int_t sys_corenbr = 1;
    sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN);
    CPU_ZERO( &set );
    for(magma_int_t i=0; i<sys_corenbr; i++)
        CPU_SET( i, &set );
    sched_setaffinity( 0, sizeof(set), &set) ;
#endif
    
    return 0;
}
Beispiel #2
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{

    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magmaFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    magmaFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    magmaFloatComplex *TAU       = data -> TAU;
    magmaFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    magmaFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

#ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
#endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_setlapack_numthreads(1);

#ifdef MAGMA_SETAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    affinity_set original_set;
    affinity_set new_set(my_core_id);
    int check  = 0;
    int check2 = 0;
    // bind threads
    check = original_set.get_affinity();
    if (check == 0) {
        check2 = new_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (single cpu)\n");
    }
    else
    {
        printf("Error in sched_getaffinity\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if(my_core_id==0)
    {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
#ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
#endif

        magma_csetmatrix(n, n_gpu, E, lde, dE, ldde);
        magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);
        magma_device_sync();

#ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f \n" ,timeQgpu);
#endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
#ifdef ENABLE_TIMER
        if(my_core_id == 1)
            timeQcpu = magma_wtime();
#endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_ctile_bulge_applyQ(my_core_id, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

#ifdef ENABLE_TIMER
        if(my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f \n" ,timeQcpu);
        }
#endif

    } // END if my_core_id

#ifdef MAGMA_SETAFFINITY
    // unbind threads
    if (check == 0) {
        check2 = original_set.set_affinity();
        if (check2 != 0)
            printf("Error in sched_setaffinity (restore cpu list)\n");
    }
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}
Beispiel #3
0
//##################################################################################################
static void *magma_capplyQ_parallel_section(void *arg)
{
    magma_int_t my_core_id   = ((magma_capplyQ_id_data*)arg) -> id;
    magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data;

    magma_int_t allcores_num   = data -> threads_num;
    magma_int_t n              = data -> n;
    magma_int_t ne             = data -> ne;
    magma_int_t n_gpu          = data -> n_gpu;
    magma_int_t nb             = data -> nb;
    magma_int_t Vblksiz        = data -> Vblksiz;
    magmaFloatComplex *E         = data -> E;
    magma_int_t lde            = data -> lde;
    magmaFloatComplex *V         = data -> V;
    magma_int_t ldv            = data -> ldv;
    magmaFloatComplex *TAU       = data -> TAU;
    magmaFloatComplex *T         = data -> T;
    magma_int_t ldt            = data -> ldt;
    magmaFloatComplex *dE        = data -> dE;
    magma_int_t ldde           = data -> ldde;
    pthread_barrier_t* barrier = &(data -> barrier);

    magma_int_t info;

    #ifdef ENABLE_TIMER
    real_Double_t timeQcpu=0.0, timeQgpu=0.0;
    #endif

    magma_int_t n_cpu = ne - n_gpu;

    // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads
    // it need that all threads setting it to 1.
    magma_set_lapack_numthreads(1);

#ifndef MAGMA_NOAFFINITY
    //#define PRINTAFFINITY
#ifdef PRINTAFFINITY
    affinity_set print_set;
    print_set.print_affinity(my_core_id, "starting affinity");
#endif
    cpu_set_t old_set, new_set;

    //store current affinity
    CPU_ZERO(&old_set);
    sched_getaffinity( 0, sizeof(old_set), &old_set);
    //set new affinity
    // bind threads
    CPU_ZERO(&new_set);
    CPU_SET(my_core_id, &new_set);
    sched_setaffinity( 0, sizeof(new_set), &new_set);
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "set affinity");
#endif
#endif

    if (my_core_id == 0) {
        //=============================================
        //   on GPU on thread 0:
        //    - apply V2*Z(:,1:N_GPU)
        //=============================================
        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime();
        #endif
        magma_queue_t queue;
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queue );

        magma_csetmatrix( n, n_gpu, E, lde, dE, ldde, queue );
        magma_cbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info);

        magma_queue_destroy( queue );
        
        #ifdef ENABLE_TIMER
        timeQgpu = magma_wtime()-timeQgpu;
        printf("  Finish Q2_GPU GGG timing= %f\n", timeQgpu);
        #endif
    } else {
        //=============================================
        //   on CPU on threads 1:allcores_num-1:
        //    - apply V2*Z(:,N_GPU+1:NE)
        //=============================================
        #ifdef ENABLE_TIMER
        if (my_core_id == 1)
            timeQcpu = magma_wtime();
        #endif

        magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1);
        magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde;
        n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1));

        magma_ctile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt);
        pthread_barrier_wait(barrier);

        #ifdef ENABLE_TIMER
        if (my_core_id == 1) {
            timeQcpu = magma_wtime()-timeQcpu;
            printf("  Finish Q2_CPU CCC timing= %f\n", timeQcpu);
        }
        #endif
    } // END if my_core_id

#ifndef MAGMA_NOAFFINITY
    //restore old affinity
    sched_setaffinity(0, sizeof(old_set), &old_set);
#ifdef PRINTAFFINITY
    print_set.print_affinity(my_core_id, "restored_affinity");
#endif
#endif

    return 0;
}