/***************************************************************************//**
 *  Main thread control
 **/
void *plasma_parallel_section(void *plasma_ptr)
{
    plasma_context_t *plasma = (plasma_context_t*)(plasma_ptr);
    PLASMA_enum action;

    /* Set thread affinity for the worker */
    plasma_setaffinity(plasma->thread_bind[plasma_rank(plasma)]);

    plasma_barrier(plasma);
    while (1) {
        pthread_mutex_lock(&plasma->action_mutex);
        while ((action = plasma->action) == PLASMA_ACT_STAND_BY)
            pthread_cond_wait(&plasma->action_condt, &plasma->action_mutex);
        pthread_mutex_unlock(&plasma->action_mutex);
        plasma_barrier(plasma);

        switch (action) {
            case PLASMA_ACT_PARALLEL:
                plasma->parallel_func_ptr(plasma);
                break;
            case PLASMA_ACT_DYNAMIC:
                QUARK_Worker_Loop(plasma->quark, plasma_rank(plasma));
                break;
            case PLASMA_ACT_FINALIZE:
                /* Unbind here: the loop never exits normally, so a call
                   placed after it would be unreachable */
                plasma_unsetaffinity();
                return NULL;
            default:
                plasma_fatal_error("plasma_parallel_section", "undefined action");
                return NULL;
        }
        plasma_barrier(plasma);
    }
}
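/*
 * Illustrative master-side counterpart of the worker loop above (a sketch,
 * not part of the library). Mirroring the pattern used by PLASMA_Finalize()
 * below, the master publishes an action under action_mutex, wakes the
 * workers, and meets them at the two barriers that bracket the action:
 *
 *     pthread_mutex_lock(&plasma->action_mutex);
 *     plasma->action = PLASMA_ACT_PARALLEL;   // or PLASMA_ACT_DYNAMIC, ...
 *     pthread_mutex_unlock(&plasma->action_mutex);
 *     pthread_cond_broadcast(&plasma->action_condt);
 *     plasma_barrier(plasma);                 // workers leave stand-by
 *     plasma->action = PLASMA_ACT_STAND_BY;   // re-arm for the next call
 *     plasma->parallel_func_ptr(plasma);      // master takes part as rank 0
 *     plasma_barrier(plasma);                 // workers are idle again
 */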
/** ****************************************************************************
 *
 * @ingroup InPlaceTransformation
 *
 *  plasma_pzpack packs all extra elements at the end of the matrix:
 *
 *       +---------------+
 *       |               |
 *       |               |
 *       |      A11      |
 *       |               |
 *       |               |
 *       +---------------+
 *       |      A21      |
 *       +---------------+
 *
 *  On entry, the matrix is stored as (column-major example; the row-major
 *  case is identical once the transposed matrix is considered):
 *       A11(:,0), A21(:,0), A11(:,1), A21(:,1), ...
 *
 *  On exit, it is stored as follows:
 *       A11(:,:), A21(:,:)
 *
 *******************************************************************************
 *
 * @param[in] plasma
 *          Plasma context
 *
 * @param[in] m
 *          Number of rows in matrix A
 *
 * @param[in] n
 *          Number of columns in matrix A
 *
 * @param[in,out] A
 *          Matrix A to pack (see above for entry and exit format)
 *
 * @param[in] m0
 *          Number of rows of A21
 *
 ******************************************************************************/
void plasma_pzpack(plasma_context_t *plasma)
{
    PLASMA_Complex64_t *A, *W, *Wl;
    PLASMA_sequence *sequence;
    PLASMA_request *request;
    int m, n, m0;
    int i, m1, size, rank, start, end, end2, bs, mod;

    plasma_unpack_args_6(m, n, A, m0, sequence, request);
    if (sequence->status != PLASMA_SUCCESS)
        return;

    /* Quick return */
    if ( n <= 1 )
        return;

    m1   = m - m0;
    size = PLASMA_SIZE;
    rank = PLASMA_RANK;

    /* Distribute the n-1 columns that must move over the threads */
    mod   = (n-1) % size;
    bs    = (n-1) / size;
    start = rank * bs;
    if ( rank < mod ) {
        bs++;
    }
    start += min( mod, rank );
    end = start + bs;

    W  = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, (m0*bs), PlasmaComplexDouble);
    Wl = (PLASMA_Complex64_t*)plasma_private_alloc(plasma, m1,      PlasmaComplexDouble);

    /* Save leftover pieces that are otherwise going to be overwritten */
    CORE_zlacpy( PlasmaUpperLower, m0, bs,
                 &(A[(int64_t)start*m + m1]), m,
                 W, m0 );

    /* Pack A */
    end2 = ((n-1) / size) * size + 1;
    for (i = rank+1; i < end2; i += size) {
        memcpy( Wl, &(A[i*m]), m1*sizeof(PLASMA_Complex64_t) );
        plasma_barrier(plasma);
        memcpy( &(A[i*m1]), Wl, m1*sizeof(PLASMA_Complex64_t) );
    }

    if ( rank < (n - end2) ) {
        i = end2 + rank;
        memcpy( Wl, &(A[i*m]), m1*sizeof(PLASMA_Complex64_t) );
        plasma_barrier(plasma);
        memcpy( &(A[i*m1]), Wl, m1*sizeof(PLASMA_Complex64_t) );
    }
    else
        plasma_barrier(plasma);

    /* Restore leftover pieces */
    CORE_zlacpy( PlasmaUpperLower, m0, bs,
                 W, m0,
                 &(A[(int64_t)m1*n + start*m0]), m0 );

    plasma_private_free(plasma, W);
    plasma_private_free(plasma, Wl);
}
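/*
 * Sequential, out-of-place reference for the packing performed above (an
 * illustrative sketch only; plasma_pzpack_seq is a hypothetical helper, not
 * a library routine). It makes the final layout explicit: column i of A11
 * starts at offset i*m1 instead of i*m, and the A21 pieces follow the m1*n
 * elements of A11, with column i of A21 at offset m1*n + i*m0.
 */
static void plasma_pzpack_seq(int m, int n, int m0,
                              const PLASMA_Complex64_t *A,
                              PLASMA_Complex64_t *B)
{
    int i;
    int m1 = m - m0;
    for (i = 0; i < n; i++) {
        /* A11(:,i): first m1 entries of column i */
        memcpy( &B[(int64_t)i*m1], &A[(int64_t)i*m],
                m1*sizeof(PLASMA_Complex64_t) );
        /* A21(:,i): last m0 entries of column i, appended after A11 */
        memcpy( &B[(int64_t)m1*n + (int64_t)i*m0], &A[(int64_t)i*m + m1],
                m0*sizeof(PLASMA_Complex64_t) );
    }
}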
/***************************************************************************//**
 *
 * @ingroup Auxiliary
 *
 *  PLASMA_Finalize - Finalize PLASMA.
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *
 ******************************************************************************/
int PLASMA_Finalize()
{
    int core;
    int status;
    void *exitcodep;
    plasma_context_t *plasma;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_Finalize()", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }

    /* Terminate the dynamic scheduler */
    plasma_dynamic_sync();

    /* Free quark structures */
    QUARK_Free(plasma->quark);

    /* Set termination action */
    pthread_mutex_lock(&plasma->action_mutex);
    plasma->action = PLASMA_ACT_FINALIZE;
    pthread_mutex_unlock(&plasma->action_mutex);
    pthread_cond_broadcast(&plasma->action_condt);

    /* Barrier and clear action */
    plasma_barrier(plasma);
    plasma->action = PLASMA_ACT_STAND_BY;

    /* Join threads */
    for (core = 1; core < plasma->world_size; core++) {
        status = pthread_join(plasma->thread_id[core], &exitcodep);
        if (status != 0) {
            plasma_fatal_error("PLASMA_Finalize", "pthread_join() failed");
            return status;
        }
    }
    plasma_barrier_finalize(plasma);
    plasma_barrier_bw_finalize(plasma);

    /* Unbind main thread */
    plasma_unsetaffinity();

    /* Destroy thread attributes */
    status = pthread_attr_destroy(&plasma->thread_attr);
    if (status != 0)
        plasma_fatal_error("PLASMA_Finalize", "pthread_attr_destroy() failed");

    /* Destroy topology */
    plasma_topology_finalize();

    status = plasma_context_remove(plasma, pthread_self());
    if (status != PLASMA_SUCCESS) {
        plasma_fatal_error("PLASMA_Finalize", "plasma_context_remove() failed");
        return status;
    }

    /* Restore the concurrency level. Ideally, the concurrency should be set
     * only if it has not been set already, and restored here only if we
     * actually changed it. */
    pthread_setconcurrency( 0 );

    return PLASMA_SUCCESS;
}
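/*
 * Typical call sequence around PLASMA_Finalize() (illustrative sketch; the
 * middle line stands for any PLASMA compute routine):
 *
 *     PLASMA_Init(4);      // launch master + 3 workers
 *     ...                  // PLASMA compute routines
 *     PLASMA_Finalize();   // join workers, free scheduler and context
 */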
/***************************************************************************//**
 *
 * @ingroup Auxiliary
 *
 *  PLASMA_Init_Affinity - Initialize PLASMA.
 *
 *******************************************************************************
 *
 * @param[in] cores
 *          Number of cores to use (threads to launch).
 *          If cores = 0, the number of cores is taken from the
 *          PLASMA_NUM_THREADS environment variable if it is set, and from
 *          the system core count otherwise.
 *
 * @param[in] coresbind
 *          Array specifying where to bind each thread.
 *          Each thread i is bound to coresbind[hwloc(i)] if hwloc is
 *          provided, or to coresbind[i] otherwise.
 *          If coresbind = NULL, the binding is taken from the
 *          PLASMA_AFF_THREADS environment variable if it is set, and
 *          defaults to the identity mapping otherwise.
 *
 *******************************************************************************
 *
 * @return
 *          \retval PLASMA_SUCCESS successful exit
 *
 ******************************************************************************/
int PLASMA_Init_Affinity(int cores, int *coresbind)
{
    plasma_context_t *plasma;
    int status;
    int core;

    /* Create context and insert in the context map */
    plasma = plasma_context_create();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_Init", "plasma_context_create() failed");
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }
    status = plasma_context_insert(plasma, pthread_self());
    if (status != PLASMA_SUCCESS) {
        plasma_fatal_error("PLASMA_Init", "plasma_context_insert() failed");
        return PLASMA_ERR_OUT_OF_RESOURCES;
    }

    /* Init number of cores and topology */
    plasma_topology_init();

    /* Set number of cores */
    if ( cores < 1 ) {
        plasma->world_size = plasma_get_numthreads();
        if ( plasma->world_size == -1 ) {
            plasma->world_size = 1;
            plasma_warning("PLASMA_Init",
                           "Could not find the number of cores: the number of threads is set to 1");
        }
    }
    else
        plasma->world_size = cores;

    if (plasma->world_size <= 0) {
        plasma_fatal_error("PLASMA_Init", "failed to get system size");
        return PLASMA_ERR_NOT_FOUND;
    }

    /* Check that the number of cores does not exceed the hard limit */
    if (plasma->world_size > CONTEXT_THREADS_MAX) {
        plasma_fatal_error("PLASMA_Init", "not supporting so many cores");
        return PLASMA_ERR_INTERNAL_LIMIT;
    }

    /* Get the size of each NUMA node */
    plasma->group_size = plasma_get_numthreads_numa();
    while ( ((plasma->world_size)%(plasma->group_size)) != 0 )
        (plasma->group_size)--;

    /* Initialize barriers */
    plasma_barrier_init(plasma);
    plasma_barrier_bw_init(plasma);

    /* Initialize default thread attributes */
    status = pthread_attr_init(&plasma->thread_attr);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_attr_init() failed");
        return status;
    }

    /* Set scope to system */
    status = pthread_attr_setscope(&plasma->thread_attr, PTHREAD_SCOPE_SYSTEM);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_attr_setscope() failed");
        return status;
    }

    /* Set concurrency */
    status = pthread_setconcurrency(plasma->world_size);
    if (status != 0) {
        plasma_fatal_error("PLASMA_Init", "pthread_setconcurrency() failed");
        return status;
    }

    /* Launch threads */
    memset(plasma->thread_id, 0, CONTEXT_THREADS_MAX*sizeof(pthread_t));
    if (coresbind != NULL) {
        memcpy(plasma->thread_bind, coresbind, plasma->world_size*sizeof(int));
    }
    else {
        plasma_get_affthreads(plasma->thread_bind);
    }

    /* Assign rank and thread ID for the master */
    plasma->thread_rank[0] = 0;
    plasma->thread_id[0]   = pthread_self();

    for (core = 1; core < plasma->world_size; core++) {
        plasma->thread_rank[core] = core;
        pthread_create(
            &plasma->thread_id[core],
            &plasma->thread_attr,
            plasma_parallel_section,
            (void*)plasma);
    }

    /* Ensure BLAS are sequential and set thread affinity for the master */
#if defined(PLASMA_WITH_MKL)
#if defined(__ICC) || defined(__INTEL_COMPILER)
    kmp_set_defaults("KMP_AFFINITY=disabled");
#endif
#endif

    /* Initialize the dynamic scheduler */
    plasma->quark = QUARK_Setup(plasma->world_size);

    plasma_barrier(plasma);
    plasma_setlapack_sequential(plasma);

    return PLASMA_SUCCESS;
}
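/*
 * Illustrative usage of PLASMA_Init_Affinity() (a sketch; the core IDs are
 * machine-specific assumptions, not recommendations):
 *
 *     int bind[] = { 0, 2, 4, 6 };       // pin one thread per physical core
 *     PLASMA_Init_Affinity( 4, bind );   // 4 threads with explicit binding
 *     ...
 *     PLASMA_Finalize();
 *
 * Passing coresbind = NULL falls back to PLASMA_AFF_THREADS when set, and to
 * the identity mapping otherwise, as documented above.
 */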