int main(void) { hwloc_topology_t topology; hwloc_obj_t obj; hwloc_topology_init(&topology); hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL); hwloc_topology_load(topology); printf("Found %d bridges\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_BRIDGE)); obj = NULL; while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) { assert(obj->type == HWLOC_OBJ_BRIDGE); /* only host->pci and pci->pci bridge supported so far */ if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST) { assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI); printf(" Found host->PCI bridge for domain %04x bus %02x-%02x\n", obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus); } else { assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI); assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI); printf(" Found PCI->PCI bridge [%04x:%04x] for domain %04x bus %02x-%02x\n", obj->attr->bridge.upstream.pci.vendor_id, obj->attr->bridge.upstream.pci.device_id, obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus); } } printf("Found %d PCI devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE)); obj = NULL; while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) { assert(obj->type == HWLOC_OBJ_PCI_DEVICE); printf(" Found PCI device class %04x vendor %04x model %04x\n", obj->attr->pcidev.class_id, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id); } printf("Found %d OS devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_OS_DEVICE)); obj = NULL; while ((obj = hwloc_get_next_osdev(topology, obj)) != NULL) { assert(obj->type == HWLOC_OBJ_OS_DEVICE); printf(" Found OS device %s subtype %d\n", obj->name, obj->attr->osdev.type); } assert(HWLOC_TYPE_DEPTH_BRIDGE == hwloc_get_type_depth(topology, HWLOC_OBJ_BRIDGE)); 
assert(HWLOC_TYPE_DEPTH_PCI_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_PCI_DEVICE)); assert(HWLOC_TYPE_DEPTH_OS_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_OS_DEVICE)); assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_PCI_DEVICE) < 0); assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_OS_DEVICE) < 0); assert(hwloc_compare_types(HWLOC_OBJ_PCI_DEVICE, HWLOC_OBJ_OS_DEVICE) < 0); hwloc_topology_destroy(topology); return 0; }
/* MPIR_Init_thread - core implementation behind MPI_Init/MPI_Init_thread.
 *
 * Initializes the threading package, hardware/network topology state,
 * predefined attributes, the builtin communicators (world/self and
 * optionally icomm_world), the MPI_INFO_ENV object, and the device
 * (MPID_Init), then publishes the post-init state atomically.
 *
 * argc/argv  - pointers to the caller's argc/argv (may be consumed by the
 *              device; passed through to MPID_Init and debug logging).
 * required   - thread level requested by the caller.
 * provided   - if non-NULL, receives the thread level actually granted.
 *
 * Returns MPI_SUCCESS or an MPI error code. On failure, fn_fail unwinds:
 * the state is reset to PRE_INIT, the global critical section is exited if
 * it was entered, and the thread CS package is finalized if initialized.
 *
 * NOTE(review): statement ordering in this function is significant (thread
 * package before any mutex use, CS enter only after basic data structures
 * exist); do not reorder. */
int MPIR_Init_thread(int *argc, char ***argv, int required, int *provided)
{
    int mpi_errno = MPI_SUCCESS;
    int has_args;
    int has_env;
    int thread_provided = 0;
    int exit_init_cs_on_failure = 0;   /* set once the GLOBAL CS is held */
    MPIR_Info *info_ptr;
#if defined(MPICH_IS_THREADED)
    bool cs_initialized = false;       /* thread_cs_init() ran; finalize on failure */
#endif

    /* The threading library must be initialized at the very beginning because
     * it manages all synchronization objects (e.g., mutexes) that will be
     * initialized later */
    {
        int thread_err;
        MPL_thread_init(&thread_err);
        if (thread_err)
            goto fn_fail;
    }

#ifdef HAVE_HWLOC
    /* Discover the hardware topology (with all I/O objects kept) and record
     * this process's CPU binding; bindset_is_valid stays 0 if either the
     * topology load or the cpubind query fails. */
    MPIR_Process.bindset = hwloc_bitmap_alloc();
    hwloc_topology_init(&MPIR_Process.hwloc_topology);
    MPIR_Process.bindset_is_valid = 0;
    hwloc_topology_set_io_types_filter(MPIR_Process.hwloc_topology, HWLOC_TYPE_FILTER_KEEP_ALL);
    if (!hwloc_topology_load(MPIR_Process.hwloc_topology)) {
        MPIR_Process.bindset_is_valid =
            !hwloc_get_proc_cpubind(MPIR_Process.hwloc_topology, getpid(),
                                    MPIR_Process.bindset, HWLOC_CPUBIND_PROCESS);
    }
#endif

#ifdef HAVE_NETLOC
    /* Network topology: only parsed when a node file CVAR is provided.
     * NOTE(review): a netloc_parse_topology failure leaves its (netloc)
     * return code in mpi_errno; it is overwritten before being returned,
     * but confirm that is intentional. */
    MPIR_Process.network_attr.u.tree.node_levels = NULL;
    MPIR_Process.network_attr.network_endpoint = NULL;
    MPIR_Process.netloc_topology = NULL;
    MPIR_Process.network_attr.type = MPIR_NETLOC_NETWORK_TYPE__INVALID;
    if (strlen(MPIR_CVAR_NETLOC_NODE_FILE)) {
        mpi_errno = netloc_parse_topology(&MPIR_Process.netloc_topology,
                                          MPIR_CVAR_NETLOC_NODE_FILE);
        if (mpi_errno == NETLOC_SUCCESS) {
            MPIR_Netloc_parse_topology(MPIR_Process.netloc_topology,
                                       &MPIR_Process.network_attr);
        }
    }
#endif

    /* For any code in the device that wants to check for runtime
     * decisions on the value of isThreaded, set a provisional
     * value here. We could let the MPID_Init routine override this */
#if defined MPICH_IS_THREADED
    MPIR_ThreadInfo.isThreaded = required == MPI_THREAD_MULTIPLE;
#endif /* MPICH_IS_THREADED */

#if defined(MPICH_IS_THREADED)
    mpi_errno = thread_cs_init();
    cs_initialized = true;
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
#endif

    /* FIXME: Move to os-dependent interface? */
#ifdef HAVE_WINDOWS_H
    /* prevent the process from bringing up an error message window if mpich
     * asserts */
    _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_FILE);
    _CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
    _CrtSetReportHook2(_CRT_RPTHOOK_INSTALL, assert_hook);
#ifdef _WIN64
    {
        /* FIXME: (Windows) This severly degrades performance but fixes alignment
         * issues with the datatype code. */
        /* Prevent misaligned faults on Win64 machines */
        UINT mode, old_mode;
        old_mode = SetErrorMode(SEM_NOALIGNMENTFAULTEXCEPT);
        mode = old_mode | SEM_NOALIGNMENTFAULTEXCEPT;
        SetErrorMode(mode);
    }
#endif
#endif

    /* We need this inorder to implement IS_THREAD_MAIN */
#if (MPICH_THREAD_LEVEL >= MPI_THREAD_SERIALIZED) && defined(MPICH_IS_THREADED)
    {
        MPID_Thread_self(&MPIR_ThreadInfo.master_thread);
    }
#endif

#ifdef HAVE_ERROR_CHECKING
    /* Because the PARAM system has not been initialized, temporarily
     * uncondtionally enable error checks. Once the PARAM system is
     * initialized, this may be reset */
    MPIR_Process.do_error_checks = 1;
#else
    MPIR_Process.do_error_checks = 0;
#endif

    /* Initialize necessary subsystems and setup the predefined attribute
     * values. Subsystems may change these values. */
    MPIR_Process.attrs.appnum = -1;
    MPIR_Process.attrs.host = MPI_PROC_NULL;
    MPIR_Process.attrs.io = MPI_PROC_NULL;
    MPIR_Process.attrs.lastusedcode = MPI_ERR_LASTCODE;
    MPIR_Process.attrs.universe = MPIR_UNIVERSE_SIZE_NOT_SET;
    MPIR_Process.attrs.wtime_is_global = 0;

    /* Set the functions used to duplicate attributes. These are
     * when the first corresponding keyval is created */
    MPIR_Process.attr_dup = 0;
    MPIR_Process.attr_free = 0;

#ifdef HAVE_CXX_BINDING
    /* Set the functions used to call functions in the C++ binding
     * for reductions and attribute operations. These are null
     * until a C++ operation is defined. This allows the C code
     * that implements these operations to not invoke a C++ code
     * directly, which may force the inclusion of symbols known only
     * to the C++ compiler (e.g., under more non-GNU compilers, including
     * Solaris and IRIX). */
    MPIR_Process.cxx_call_op_fn = 0;
#endif

#ifdef HAVE_F08_BINDING
    MPIR_C_MPI_UNWEIGHTED = MPI_UNWEIGHTED;
    MPIR_C_MPI_WEIGHTS_EMPTY = MPI_WEIGHTS_EMPTY;
#endif

    /* This allows the device to select an alternative function for
     * dimsCreate */
    MPIR_Process.dimsCreate = 0;

    /* "Allocate" from the reserved space for builtin communicators and
     * (partially) initialize predefined communicators. comm_parent is
     * intially NULL and will be allocated by the device if the process group
     * was started using one of the MPI_Comm_spawn functions. */
    MPIR_Process.comm_world = MPIR_Comm_builtin + 0;
    MPII_Comm_init(MPIR_Process.comm_world);
    MPIR_Process.comm_world->handle = MPI_COMM_WORLD;
    /* context ids 0/1/2 are reserved for the builtin communicators */
    MPIR_Process.comm_world->context_id = 0 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_world->recvcontext_id = 0 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_world->comm_kind = MPIR_COMM_KIND__INTRACOMM;
    /* This initialization of the comm name could be done only when
     * comm_get_name is called */
    MPL_strncpy(MPIR_Process.comm_world->name, "MPI_COMM_WORLD", MPI_MAX_OBJECT_NAME);

    MPIR_Process.comm_self = MPIR_Comm_builtin + 1;
    MPII_Comm_init(MPIR_Process.comm_self);
    MPIR_Process.comm_self->handle = MPI_COMM_SELF;
    MPIR_Process.comm_self->context_id = 1 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_self->recvcontext_id = 1 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.comm_self->comm_kind = MPIR_COMM_KIND__INTRACOMM;
    MPL_strncpy(MPIR_Process.comm_self->name, "MPI_COMM_SELF", MPI_MAX_OBJECT_NAME);

#ifdef MPID_NEEDS_ICOMM_WORLD
    MPIR_Process.icomm_world = MPIR_Comm_builtin + 2;
    MPII_Comm_init(MPIR_Process.icomm_world);
    MPIR_Process.icomm_world->handle = MPIR_ICOMM_WORLD;
    MPIR_Process.icomm_world->context_id = 2 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.icomm_world->recvcontext_id = 2 << MPIR_CONTEXT_PREFIX_SHIFT;
    MPIR_Process.icomm_world->comm_kind = MPIR_COMM_KIND__INTRACOMM;
    MPL_strncpy(MPIR_Process.icomm_world->name, "MPI_ICOMM_WORLD", MPI_MAX_OBJECT_NAME);
    /* Note that these communicators are not ready for use - MPID_Init
     * will setup self and world, and icomm_world if it desires it. */
#endif

    MPIR_Process.comm_parent = NULL;

    /* Setup the initial communicator list in case we have
     * enabled the debugger message-queue interface */
    MPII_COMML_REMEMBER(MPIR_Process.comm_world);
    MPII_COMML_REMEMBER(MPIR_Process.comm_self);

    /* MPIU_Timer_pre_init(); */

    /* Wait for debugger to attach if requested. */
    if (MPIR_CVAR_DEBUG_HOLD) {
        /* volatile so the attached debugger can clear 'hold' to release us */
        volatile int hold = 1;
        while (hold)
#ifdef HAVE_USLEEP
            usleep(100);
#endif
            ;
    }

#if defined(HAVE_ERROR_CHECKING) && (HAVE_ERROR_CHECKING == MPID_ERROR_LEVEL_RUNTIME)
    MPIR_Process.do_error_checks = MPIR_CVAR_ERROR_CHECKING;
#endif

    /* define MPI as initialized so that we can use MPI functions within
     * MPID_Init if necessary */
    OPA_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__IN_INIT);

    /* We can't acquire any critical sections until this point. Any
     * earlier the basic data structures haven't been initialized */
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    exit_init_cs_on_failure = 1;

    /* create MPI_INFO_NULL object */
    /* FIXME: Currently this info object is empty, we need to add data to this
     * as defined by the standard. */
    info_ptr = MPIR_Info_builtin + 1;
    info_ptr->handle = MPI_INFO_ENV;
    MPIR_Object_set_ref(info_ptr, 1);
    info_ptr->next = NULL;
    info_ptr->key = NULL;
    info_ptr->value = NULL;

#ifdef USE_MEMORY_TRACING
    MPL_trinit();
#endif

    /* Set the number of tag bits. The device may override this value. */
    MPIR_Process.tag_bits = MPIR_TAG_BITS_DEFAULT;

    /* Create complete request to return in the event of immediately complete
     * operations. Use a SEND request to cover all possible use-cases. */
    MPIR_Process.lw_req = MPIR_Request_create(MPIR_REQUEST_KIND__SEND);
    MPIR_ERR_CHKANDSTMT(MPIR_Process.lw_req == NULL, mpi_errno, MPIX_ERR_NOREQ,
                        goto fn_fail, "**nomemreq");
    MPIR_cc_set(&MPIR_Process.lw_req->cc, 0);

    mpi_errno = MPID_Init(argc, argv, required, &thread_provided, &has_args, &has_env);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* Initialize collectives infrastructure */
    mpi_errno = MPII_Coll_init();
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* Set tag_ub as function of tag_bits set by the device */
    MPIR_Process.attrs.tag_ub = MPIR_TAG_USABLE_BITS;

    /* Assert: tag_ub should be a power of 2 minus 1 */
    MPIR_Assert(((unsigned) MPIR_Process.attrs.tag_ub &
                 ((unsigned) MPIR_Process.attrs.tag_ub + 1)) == 0);

    /* Assert: tag_ub is at least the minimum asked for in the MPI spec */
    MPIR_Assert(MPIR_Process.attrs.tag_ub >= 32767);

    /* Capture the level of thread support provided */
    MPIR_ThreadInfo.thread_provided = thread_provided;
    if (provided)
        *provided = thread_provided;
#if defined MPICH_IS_THREADED
    MPIR_ThreadInfo.isThreaded = (thread_provided == MPI_THREAD_MULTIPLE);
#endif /* MPICH_IS_THREADED */

    /* FIXME: Define these in the interface. Does Timer init belong here? */
    MPII_Timer_init(MPIR_Process.comm_world->rank, MPIR_Process.comm_world->local_size);

#ifdef USE_MEMORY_TRACING
#ifdef MPICH_IS_THREADED
    MPL_trconfig(MPIR_Process.comm_world->rank, MPIR_ThreadInfo.isThreaded);
#else
    MPL_trconfig(MPIR_Process.comm_world->rank, 0);
#endif
    /* Indicate that we are near the end of the init step; memory
     * allocated already will have an id of zero; this helps
     * separate memory leaks in the initialization code from
     * leaks in the "active" code */
#endif

#ifdef MPL_USE_DBG_LOGGING
    /* FIXME: This is a hack to handle the common case of two worlds.
     * If the parent comm is not NULL, we always give the world number
     * as "1" (false). */
#ifdef MPICH_IS_THREADED
    MPL_dbg_init(argc, argv, has_args, has_env, MPIR_Process.comm_parent != NULL,
                 MPIR_Process.comm_world->rank, MPIR_ThreadInfo.isThreaded);
#else
    MPL_dbg_init(argc, argv, has_args, has_env, MPIR_Process.comm_parent != NULL,
                 MPIR_Process.comm_world->rank, 0);
#endif
    /* Register the debug-logging classes used throughout the library. */
    MPIR_DBG_INIT = MPL_dbg_class_alloc("INIT", "init");
    MPIR_DBG_PT2PT = MPL_dbg_class_alloc("PT2PT", "pt2pt");
    MPIR_DBG_THREAD = MPL_dbg_class_alloc("THREAD", "thread");
    MPIR_DBG_DATATYPE = MPL_dbg_class_alloc("DATATYPE", "datatype");
    MPIR_DBG_HANDLE = MPL_dbg_class_alloc("HANDLE", "handle");
    MPIR_DBG_COMM = MPL_dbg_class_alloc("COMM", "comm");
    MPIR_DBG_BSEND = MPL_dbg_class_alloc("BSEND", "bsend");
    MPIR_DBG_ERRHAND = MPL_dbg_class_alloc("ERRHAND", "errhand");
    MPIR_DBG_OTHER = MPL_dbg_class_alloc("OTHER", "other");
    MPIR_DBG_REQUEST = MPL_dbg_class_alloc("REQUEST", "request");
    MPIR_DBG_COLL = MPL_dbg_class_alloc("COLL", "coll");
    MPIR_DBG_ASSERT = MPL_dbg_class_alloc("ASSERT", "assert");
    MPIR_DBG_STRING = MPL_dbg_class_alloc("STRING", "string");
#endif

    /* Initialize the C versions of the Fortran link-time constants.
     *
     * We now initialize the Fortran symbols from within the Fortran
     * interface in the routine that first needs the symbols.
     * This fixes a problem with symbols added by a Fortran compiler that
     * are not part of the C runtime environment (the Portland group
     * compilers would do this) */
#if defined(HAVE_FORTRAN_BINDING) && defined(HAVE_MPI_F_INIT_WORKS_WITH_C)
    mpirinitf_();
#endif

    /* FIXME: Does this need to come before the call to MPID_InitComplete?
     * For some debugger support, MPII_Wait_for_debugger may want to use
     * MPI communication routines to collect information for the debugger */
#ifdef HAVE_DEBUGGER_SUPPORT
    MPII_Wait_for_debugger();
#endif

    /* Let the device know that the rest of the init process is completed */
    if (mpi_errno == MPI_SUCCESS)
        mpi_errno = MPID_InitCompleted();

    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    /* Make fields of MPIR_Process global visible and set mpich_state
     * atomically so that MPI_Initialized() etc. are thread safe */
    OPA_write_barrier();
    OPA_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__POST_INIT);

    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    /* signal to error handling routines that core services are unavailable */
    OPA_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__PRE_INIT);

    if (exit_init_cs_on_failure) {
        MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    }
#if defined(MPICH_IS_THREADED)
    if (cs_initialized) {
        MPIR_Thread_CS_Finalize();
    }
#endif
    return mpi_errno;
    /* --END ERROR HANDLING-- */
}
/**
 * pocl_topology_detect_device_info() - fill CPU-device fields from hwloc.
 *
 * Uses a private, minimal hwloc topology (all non-essential object types
 * filtered out) to populate @device->global_mem_size, max_compute_units,
 * and the global memory cache geometry.
 *
 * @device: the pocl device whose fields are written.
 *
 * Returns 0 on success, or the failing hwloc call's -1 on init/load error.
 * The topology object is always destroyed before returning once loaded.
 */
int
pocl_topology_detect_device_info(cl_device_id device)
{
  hwloc_topology_t pocl_topology;
  int ret = 0;

  /* Runtime/compile-time hwloc major-version mismatch is ABI-unsafe;
   * warn loudly (original behavior: warn but continue). */
#ifdef HWLOC_API_2
  if (hwloc_get_api_version () < 0x20000)
    POCL_MSG_ERR ("pocl was compiled against libhwloc 2.x but is "
                  "actually running against libhwloc 1.x \n");
#else
  if (hwloc_get_api_version () >= 0x20000)
    POCL_MSG_ERR ("pocl was compiled against libhwloc 1.x but is "
                  "actually running against libhwloc 2.x \n");
#endif

  /*
   * hwloc's OpenCL backend causes problems at the initialization stage
   * because it reloads libpocl.so via the ICD loader.
   *
   * See: https://github.com/pocl/pocl/issues/261
   *
   * The only trick to stop hwloc from initializing the OpenCL plugin
   * I could find is to point the plugin search path to a place where there
   * are no plugins to be found.
   */
  setenv ("HWLOC_PLUGINS_PATH", "/dev/null", 1);

  ret = hwloc_topology_init (&pocl_topology);
  if (ret == -1)
    {
      POCL_MSG_ERR ("Cannot initialize the topology.\n");
      return ret;
    }

  /* Keep the topology as small as possible: we only need PUs, cores and
   * caches, so drop I/O, system, group, misc, bridge and device objects. */
#ifdef HWLOC_API_2
  hwloc_topology_set_io_types_filter (pocl_topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_SYSTEM, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_GROUP, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_BRIDGE, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_MISC, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_OS_DEVICE, HWLOC_TYPE_FILTER_KEEP_NONE);
#else
  /* hwloc 1.x does not discover I/O objects unless the topology flag
   * HWLOC_TOPOLOGY_FLAG_WHOLE_IO is set via hwloc_topology_set_flags(),
   * so nothing to disable for I/O here. (An earlier revision passed that
   * flag constant to hwloc_topology_ignore_type(), which expects an
   * hwloc_obj_type_t; hwloc rejected the bogus value, so the call was a
   * no-op and has been removed.) */
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_SYSTEM);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_GROUP);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_BRIDGE);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_MISC);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_PCI_DEVICE);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_OS_DEVICE);
#endif

  ret = hwloc_topology_load (pocl_topology);
  if (ret == -1)
    {
      POCL_MSG_ERR ("Cannot load the topology.\n");
      goto exit_destroy;
    }

  /* Total physical memory of the machine (field moved between hwloc 1/2). */
#ifdef HWLOC_API_2
  device->global_mem_size = hwloc_get_root_obj (pocl_topology)->total_memory;
#else
  device->global_mem_size = hwloc_get_root_obj (pocl_topology)->memory.total_memory;
#endif

  /* Try to get the number of CPU cores from topology */
  int depth = hwloc_get_type_depth (pocl_topology, HWLOC_OBJ_PU);
  if (depth != HWLOC_TYPE_DEPTH_UNKNOWN)
    device->max_compute_units = hwloc_get_nbobjs_by_depth (pocl_topology, depth);

  /* Find information about global memory cache by looking at the first
   * cache covering the first PU */
  do
    {
      size_t cache_size = 0, cacheline_size = 0;

      hwloc_obj_t core
          = hwloc_get_next_obj_by_type (pocl_topology, HWLOC_OBJ_CORE, NULL);
      if (core)
        {
          hwloc_obj_t cache
              = hwloc_get_shared_cache_covering_obj (pocl_topology, core);
          if ((cache) && (cache->attr))
            {
              cacheline_size = cache->attr->cache.linesize;
              cache_size = cache->attr->cache.size;
            }
          else
            core = NULL; /* fallback to L1 cache size */
        }

      hwloc_obj_t pu
          = hwloc_get_next_obj_by_type (pocl_topology, HWLOC_OBJ_PU, NULL);
      if (!core && pu)
        {
          hwloc_obj_t cache
              = hwloc_get_shared_cache_covering_obj (pocl_topology, pu);
          if ((cache) && (cache->attr))
            {
              cacheline_size = cache->attr->cache.linesize;
              cache_size = cache->attr->cache.size;
            }
        }

      if (!cache_size || !cacheline_size)
        break;

      device->global_mem_cache_type
          = 0x2; // CL_READ_WRITE_CACHE, without including all of CL/cl.h
      device->global_mem_cacheline_size = cacheline_size;
      device->global_mem_cache_size = cache_size;
    }
  while (0);

  // Destroy topology object and return
exit_destroy:
  hwloc_topology_destroy (pocl_topology);
  return ret;
}