static void report_duplicated_guid(IN osm_sm_t * sm, osm_physp_t * p_physp,
				   osm_node_t * p_neighbor_node,
				   const uint8_t port_num)
{
	osm_physp_t *p_old, *p_new;
	osm_dr_path_t path;

	p_old = p_physp->p_remote_physp;
	p_new = osm_node_get_physp_ptr(p_neighbor_node, port_num);

	OSM_LOG(sm->p_log, OSM_LOG_SYS | OSM_LOG_ERROR, "ERR 0D01: "
		"Found duplicated node GUID.\n"
		"Node 0x%" PRIx64 " port %u is reachable from remote node "
		"0x%" PRIx64 " port %u and remote node 0x%" PRIx64 " port %u.\n"
		"Paths are:\n",
		cl_ntoh64(p_physp->p_node->node_info.node_guid),
		p_physp->port_num,
		p_old ? cl_ntoh64(p_old->p_node->node_info.node_guid) : 0,
		p_old ? p_old->port_num : 0,
		p_new ? cl_ntoh64(p_new->p_node->node_info.node_guid) : 0,
		p_new ? p_new->port_num : 0);

	osm_dump_dr_path_v2(sm->p_log, osm_physp_get_dr_path_ptr(p_physp),
			    FILE_ID, OSM_LOG_ERROR);

	path = *osm_physp_get_dr_path_ptr(p_new);
	if (osm_dr_path_extend(&path, port_num))
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D05: "
			"DR path with hop count %d couldn't be extended\n",
			path.hop_count);
	osm_dump_dr_path_v2(sm->p_log, &path, FILE_ID, OSM_LOG_ERROR);
}
/**********************************************************************
 The plock must NOT be held before calling this function.
**********************************************************************/
static void ni_rcv_process_new(IN osm_sm_t * sm, IN const osm_madw_t * p_madw)
{
	osm_node_t *p_node;
	osm_node_t *p_node_check;
	osm_port_t *p_port;
	osm_port_t *p_port_check;
	osm_router_t *p_rtr = NULL;
	osm_router_t *p_rtr_check;
	cl_qmap_t *p_rtr_guid_tbl;
	ib_node_info_t *p_ni;
	ib_smp_t *p_smp;
	osm_ni_context_t *p_ni_context;
	osm_alias_guid_t *p_alias_guid, *p_alias_guid_check;
	uint8_t port_num;

	OSM_LOG_ENTER(sm->p_log);

	p_smp = osm_madw_get_smp_ptr(p_madw);
	p_ni = ib_smp_get_payload_ptr(p_smp);
	p_ni_context = osm_madw_get_ni_context_ptr(p_madw);
	port_num = ib_node_info_get_local_port_num(p_ni);

	osm_dump_smp_dr_path_v2(sm->p_log, p_smp, FILE_ID, OSM_LOG_VERBOSE);

	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
		"Discovered new %s node,"
		"\n\t\t\t\tGUID 0x%" PRIx64 ", TID 0x%" PRIx64 "\n",
		ib_get_node_type_str(p_ni->node_type),
		cl_ntoh64(p_ni->node_guid), cl_ntoh64(p_smp->trans_id));

	if (PF(port_num > p_ni->num_ports)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D0A: "
			"New %s node GUID 0x%" PRIx64 "is non-compliant and "
			"is being ignored since the "
			"local port num %u > num ports %u\n",
			ib_get_node_type_str(p_ni->node_type),
			cl_ntoh64(p_ni->node_guid), port_num,
			p_ni->num_ports);
		goto Exit;
	}

	p_node = osm_node_new(p_madw);
	if (PF(p_node == NULL)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D07: "
			"Unable to create new node object\n");
		goto Exit;
	}

	/*
	   Create a new port object to represent this node's physical
	   ports in the port table.
	 */
	p_port = osm_port_new(p_ni, p_node);
	if (PF(p_port == NULL)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D14: "
			"Unable to create new port object\n");
		osm_node_delete(&p_node);
		goto Exit;
	}

	/*
	   Add the new port object to the database.
	 */
	p_port_check =
	    (osm_port_t *) cl_qmap_insert(&sm->p_subn->port_guid_tbl,
					  p_ni->port_guid, &p_port->map_item);
	if (PF(p_port_check != p_port)) {
		/*
		   We should never be here!
		   Somehow, this port GUID already exists in the table.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D15: "
			"Duplicate Port GUID 0x%" PRIx64
			"! Found by the two directed routes:\n",
			cl_ntoh64(p_ni->port_guid));
		osm_dump_dr_path_v2(sm->p_log,
				    osm_physp_get_dr_path_ptr(p_port->p_physp),
				    FILE_ID, OSM_LOG_ERROR);
		osm_dump_dr_path_v2(sm->p_log,
				    osm_physp_get_dr_path_ptr(p_port_check->
							   p_physp),
				    FILE_ID, OSM_LOG_ERROR);
		osm_port_delete(&p_port);
		osm_node_delete(&p_node);
		goto Exit;
	}

	p_alias_guid = osm_alias_guid_new(p_ni->port_guid,
					  p_port);
	if (PF(!p_alias_guid)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D18: "
			"alias guid memory allocation failed"
			" for port GUID 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->port_guid));
		goto alias_done2;
	}

	/* insert into alias guid table */
	p_alias_guid_check =
		(osm_alias_guid_t *) cl_qmap_insert(&sm->p_subn->alias_port_guid_tbl,
						    p_alias_guid->alias_guid,
						    &p_alias_guid->map_item);
	if (p_alias_guid_check != p_alias_guid) {
		/* alias GUID is a duplicate */
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D19: "
			"Duplicate alias port GUID 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->port_guid));
		osm_alias_guid_delete(&p_alias_guid);
	}

alias_done2:
	/* If we are a master, then this means the port is new on the subnet.
	   Mark it as new - need to send trap 64 on these ports.
	   The condition that we are master is true, since if we are in discovering
	   state (meaning we woke up from standby or we are just initializing),
	   then these ports may be new to us, but are not new on the subnet.
	   If we are master, then the subnet as we know it is the updated one,
	   and any new ports we encounter should cause trap 64. C14-72.1.1 */
	if (sm->p_subn->sm_state == IB_SMINFO_STATE_MASTER)
		p_port->is_new = 1;

	/* If there were RouterInfo or other router attribute,
	   this would be elsewhere */
	if (p_ni->node_type == IB_NODE_TYPE_ROUTER) {
		if (PF((p_rtr = osm_router_new(p_port)) == NULL))
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1A: "
				"Unable to create new router object\n");
		else {
			p_rtr_guid_tbl = &sm->p_subn->rtr_guid_tbl;
			p_rtr_check =
			    (osm_router_t *) cl_qmap_insert(p_rtr_guid_tbl,
							    p_ni->port_guid,
							    &p_rtr->map_item);
			if (PF(p_rtr_check != p_rtr))
				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D1B: "
					"Unable to add port GUID:0x%016" PRIx64
					" to router table\n",
					cl_ntoh64(p_ni->port_guid));
		}
	}

	p_node_check =
	    (osm_node_t *) cl_qmap_insert(&sm->p_subn->node_guid_tbl,
					  p_ni->node_guid, &p_node->map_item);
	if (PF(p_node_check != p_node)) {
		/*
		   This node must have been inserted by another thread.
		   This is unexpected, but is not an error.
		   We can simply clean-up, since the other thread will
		   see this processing through to completion.
		 */
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"Discovery race detected at node 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni->node_guid));
		osm_node_delete(&p_node);
		p_node = p_node_check;
		ni_rcv_set_links(sm, p_node, port_num, p_ni_context);
		goto Exit;
	} else
		ni_rcv_set_links(sm, p_node, port_num, p_ni_context);

	p_node->discovery_count++;
	ni_rcv_get_node_desc(sm, p_node, p_madw);

	switch (p_ni->node_type) {
	case IB_NODE_TYPE_CA:
	case IB_NODE_TYPE_ROUTER:
		ni_rcv_process_new_ca_or_router(sm, p_node, p_madw);
		break;
	case IB_NODE_TYPE_SWITCH:
		ni_rcv_process_new_switch(sm, p_node, p_madw);
		break;
	default:
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D16: "
			"Unknown node type %u with GUID 0x%" PRIx64 "\n",
			p_ni->node_type, cl_ntoh64(p_ni->node_guid));
		break;
	}

Exit:
	OSM_LOG_EXIT(sm->p_log);
}
/**********************************************************************
 Initiates a lightweight sweep of the subnet.
 Used during normal sweeps after the subnet is up.
**********************************************************************/
static ib_api_status_t state_mgr_light_sweep_start(IN osm_sm_t * sm)
{
	ib_api_status_t status = IB_SUCCESS;
	osm_bind_handle_t h_bind;
	cl_qmap_t *p_sw_tbl;
	cl_map_item_t *p_next;
	osm_node_t *p_node;
	osm_physp_t *p_physp;
	uint8_t port_num;

	OSM_LOG_ENTER(sm->p_log);

	p_sw_tbl = &sm->p_subn->sw_guid_tbl;

	/*
	 * First, get the bind handle.
	 */
	h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl);
	if (h_bind == OSM_BIND_INVALID_HANDLE) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"No bound ports. Deferring sweep...\n");
		status = IB_INVALID_STATE;
		goto _exit;
	}

	OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, "INITIATING LIGHT SWEEP");
	CL_PLOCK_ACQUIRE(sm->p_lock);
	cl_qmap_apply_func(p_sw_tbl, state_mgr_get_sw_info, sm);
	CL_PLOCK_RELEASE(sm->p_lock);

	CL_PLOCK_ACQUIRE(sm->p_lock);
	cl_qmap_apply_func(&sm->p_subn->node_guid_tbl, state_mgr_get_node_desc,
			   sm);
	CL_PLOCK_RELEASE(sm->p_lock);

	/* now scan the list of physical ports that were not down but have no remote port */
	CL_PLOCK_ACQUIRE(sm->p_lock);
	p_next = cl_qmap_head(&sm->p_subn->node_guid_tbl);
	while (p_next != cl_qmap_end(&sm->p_subn->node_guid_tbl)) {
		p_node = (osm_node_t *) p_next;
		p_next = cl_qmap_next(p_next);

		for (port_num = 1; port_num < osm_node_get_num_physp(p_node);
		     port_num++) {
			p_physp = osm_node_get_physp_ptr(p_node, port_num);
			if (p_physp && (osm_physp_get_port_state(p_physp) !=
					IB_LINK_DOWN)
			    && !osm_physp_get_remote(p_physp)) {
				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 3315: "
					"Unknown remote side for node 0x%016"
					PRIx64
					" (%s) port %u. Adding to light sweep sampling list\n",
					cl_ntoh64(osm_node_get_node_guid
						  (p_node)),
					p_node->print_desc, port_num);

				osm_dump_dr_path_v2(sm->p_log,
						    osm_physp_get_dr_path_ptr
						    (p_physp), FILE_ID, OSM_LOG_ERROR);

				state_mgr_get_remote_port_info(sm, p_physp);
			}
		}
	}

	cl_qmap_apply_func(&sm->p_subn->sm_guid_tbl, query_sm_info, sm);

	CL_PLOCK_RELEASE(sm->p_lock);

_exit:
	OSM_LOG_EXIT(sm->p_log);
	return status;
}
/**********************************************************************
 The plock must be held before calling this function.
**********************************************************************/
static void ni_rcv_set_links(IN osm_sm_t * sm, osm_node_t * p_node,
			     const uint8_t port_num,
			     const osm_ni_context_t * p_ni_context)
{
	osm_node_t *p_neighbor_node;
	osm_physp_t *p_physp, *p_remote_physp;

	OSM_LOG_ENTER(sm->p_log);

	/*
	   A special case exists in which the node we're trying to
	   link is our own node.  In this case, the guid value in
	   the ni_context will be zero.
	 */
	if (p_ni_context->node_guid == 0) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Nothing to link for our own node 0x%" PRIx64 "\n",
			cl_ntoh64(osm_node_get_node_guid(p_node)));
		goto _exit;
	}

	p_neighbor_node = osm_get_node_by_guid(sm->p_subn,
					       p_ni_context->node_guid);
	if (PF(!p_neighbor_node)) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0D10: "
			"Unexpected removal of neighbor node 0x%" PRIx64 "\n",
			cl_ntoh64(p_ni_context->node_guid));
		goto _exit;
	}

	/* When setting the link, ports on both
	   sides of the link should be initialized */
	CL_ASSERT(osm_node_link_has_valid_ports(p_node, port_num,
						p_neighbor_node,
						p_ni_context->port_num));

	if (osm_node_link_exists(p_node, port_num,
				 p_neighbor_node, p_ni_context->port_num)) {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Link already exists\n");
		goto _exit;
	}

	p_physp = osm_node_get_physp_ptr(p_node, port_num);
	if (!p_physp) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD0E: "
			"Failed to find physp for port %d of Node GUID 0x%"
			PRIx64 "\n", port_num,
			cl_ntoh64(osm_node_get_node_guid(p_node)));
		goto _exit;
	}

	/*
	 * If the link went UP, after we already discovered it, we shouldn't
	 * set the link between the ports and resweep.
	 */
	if (osm_physp_get_port_state(p_physp) == IB_LINK_DOWN &&
	    p_node->physp_discovered[port_num]) {
		/* Link down on another side. Don't create a link*/
		p_node->physp_discovered[port_num] = 0;
		sm->p_subn->force_heavy_sweep = TRUE;
		goto _exit;
	}

	if (osm_node_has_any_link(p_node, port_num) &&
	    sm->p_subn->force_heavy_sweep == FALSE &&
	    (!p_ni_context->dup_count ||
	     (p_ni_context->dup_node_guid == osm_node_get_node_guid(p_node) &&
	      p_ni_context->dup_port_num == port_num))) {
		/*
		   Uh oh...
		   This could be reconnected ports, but also duplicated GUID
		   (2 nodes have the same guid) or a 12x link with lane reversal
		   that is not configured correctly.
		   We will try to recover by querying NodeInfo again.
		   In order to catch even fast port moving to new location(s)
		   and back we will count up to 5.
		   Some crazy reconnections (newly created switch loop right
		   before targeted CA) will not be catched this way. So in worst
		   case - report GUID duplication and request new discovery.
		   When switch node is targeted NodeInfo querying will be done
		   in opposite order, this is much stronger check, unfortunately
		   it is impossible with CAs.
		 */
		p_physp = osm_node_get_physp_ptr(p_node, port_num);
		if (!p_physp) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD0F: "
				"Failed to find physp for port %d of Node GUID 0x%"
				PRIx64 "\n", port_num,
				cl_ntoh64(osm_node_get_node_guid(p_node)));
			goto _exit;
		}

		if (p_ni_context->dup_count > 5) {
			report_duplicated_guid(sm, p_physp, p_neighbor_node,
					       p_ni_context->port_num);
			sm->p_subn->force_heavy_sweep = TRUE;
		} else if (p_node->sw)
			requery_dup_node_info(sm, p_physp->p_remote_physp,
					      p_ni_context->dup_count + 1);
		else
			requery_dup_node_info(sm, p_physp,
					      p_ni_context->dup_count + 1);
	}

	/*
	   When there are only two nodes with exact same guids (connected back
	   to back) - the previous check for duplicated guid will not catch
	   them. But the link will be from the port to itself...
	   Enhanced Port 0 is an exception to this
	 */
	if (osm_node_get_node_guid(p_node) == p_ni_context->node_guid &&
	    port_num == p_ni_context->port_num &&
	    port_num != 0 && cl_qmap_count(&sm->p_subn->sw_guid_tbl) == 0) {
		OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
			"Duplicate GUID found by link from a port to itself:"
			"node 0x%" PRIx64 ", port number %u\n",
			cl_ntoh64(osm_node_get_node_guid(p_node)), port_num);
		p_physp = osm_node_get_physp_ptr(p_node, port_num);
		if (!p_physp) {
			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR OD1D: "
				"Failed to find physp for port %d of Node GUID 0x%"
				PRIx64 "\n", port_num,
				cl_ntoh64(osm_node_get_node_guid(p_node)));
			goto _exit;
		}

		osm_dump_dr_path_v2(sm->p_log, osm_physp_get_dr_path_ptr(p_physp),
				    FILE_ID, OSM_LOG_VERBOSE);

		if (sm->p_subn->opt.exit_on_fatal == TRUE) {
			osm_log_v2(sm->p_log, OSM_LOG_SYS, FILE_ID,
				   "Errors on subnet. Duplicate GUID found "
				   "by link from a port to itself. "
				   "See verbose opensm.log for more details\n");
			exit(1);
		}
	}

	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
		"Creating new link between:\n\t\t\t\tnode 0x%" PRIx64
		", port number %u and\n\t\t\t\tnode 0x%" PRIx64
		", port number %u\n",
		cl_ntoh64(osm_node_get_node_guid(p_node)), port_num,
		cl_ntoh64(p_ni_context->node_guid), p_ni_context->port_num);

	if (sm->ucast_mgr.cache_valid)
		osm_ucast_cache_check_new_link(&sm->ucast_mgr, p_node, port_num,
					       p_neighbor_node,
					       p_ni_context->port_num);

	p_physp = osm_node_get_physp_ptr(p_node, port_num);
	p_remote_physp = osm_node_get_physp_ptr(p_neighbor_node,
						p_ni_context->port_num);
	if (!p_physp || !p_remote_physp)
		goto _exit;

	osm_node_link(p_node, port_num, p_neighbor_node, p_ni_context->port_num);

	osm_db_neighbor_set(sm->p_subn->p_neighbor,
			    cl_ntoh64(osm_physp_get_port_guid(p_physp)),
			    port_num,
			    cl_ntoh64(osm_physp_get_port_guid(p_remote_physp)),
			    p_ni_context->port_num);
	osm_db_neighbor_set(sm->p_subn->p_neighbor,
			    cl_ntoh64(osm_physp_get_port_guid(p_remote_physp)),
			    p_ni_context->port_num,
			    cl_ntoh64(osm_physp_get_port_guid(p_physp)),
			    port_num);

_exit:
	OSM_LOG_EXIT(sm->p_log);
}
Exemple #5
0
/**********************************************************************
  The plock must be held before calling this function.
**********************************************************************/
static ib_net64_t req_determine_mkey(IN osm_sm_t * sm,
				     IN const osm_dr_path_t * p_path)
{
	osm_node_t *p_node;
	osm_port_t *p_sm_port;
	osm_physp_t *p_physp;
	ib_net64_t dest_port_guid = 0, m_key;
	uint8_t hop;

	OSM_LOG_ENTER(sm->p_log);

	p_physp = NULL;

	p_sm_port = osm_get_port_by_guid(sm->p_subn, sm->p_subn->sm_port_guid);

	/* hop_count == 0: destination port guid is SM */
	if (p_path->hop_count == 0) {
		dest_port_guid = sm->p_subn->sm_port_guid;
		goto Remote_Guid;
	}

	if (p_sm_port) {
		p_node = p_sm_port->p_node;
		if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH)
			p_physp = osm_node_get_physp_ptr(p_node, p_path->path[1]);
		else
			p_physp = p_sm_port->p_physp;
	}

	/* hop_count == 1: outgoing physp is SM physp */
	for (hop = 2; p_physp && hop <= p_path->hop_count; hop++) {
		p_physp = p_physp->p_remote_physp;
		if (!p_physp)
			break;
		p_node = p_physp->p_node;
		p_physp = osm_node_get_physp_ptr(p_node, p_path->path[hop]);
	}

	/* At this point, p_physp points at the outgoing physp on the
	   last hop, or NULL if we don't know it.
	*/
	if (!p_physp) {
		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
			"ERR 1107: Outgoing physp is null on non-hop_0!\n");
		osm_dump_dr_path_v2(sm->p_log, p_path, FILE_ID, OSM_LOG_ERROR);
		dest_port_guid = 0;
		goto Remote_Guid;
	}

	if (p_physp->p_remote_physp) {
		dest_port_guid = p_physp->p_remote_physp->port_guid;
		goto Remote_Guid;
	}

	OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "Target port guid unknown, "
		"using persistent DB\n");
	if (!osm_db_neighbor_get(sm->p_subn->p_neighbor,
				 cl_ntoh64(p_physp->port_guid),
				 p_physp->port_num,
				 &dest_port_guid, NULL)) {
		dest_port_guid = cl_hton64(dest_port_guid);
	}

Remote_Guid:
	if (dest_port_guid) {
		if (!osm_db_guid2mkey_get(sm->p_subn->p_g2m,
					  cl_ntoh64(dest_port_guid), &m_key)) {
			m_key = cl_hton64(m_key);
			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
				"Found mkey for guid 0x%"
				PRIx64 "\n", cl_ntoh64(dest_port_guid));
		} else {
			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
				"Target port mkey unknown, using default\n");
			m_key = sm->p_subn->opt.m_key;
		}
	} else {
		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
			"Target port guid unknown, using default\n");
		m_key = sm->p_subn->opt.m_key;
	}

	OSM_LOG_EXIT(sm->p_log);

	return m_key;
}