bool sockinfo::detach_receiver(flow_tuple_with_local_if &flow_key)
{
	si_logdbg("Unregistering receiver: %s", flow_key.to_str());

	// TODO ALEXR: DO we need to return a 3 tuple instead of a 5 tuple being removed?
	// if (peer_ip != INADDR_ANY && peer_port != INPORT_ANY);

	// Find ring associated with this tuple
	rx_flow_map_t::iterator rx_flow_iter = m_rx_flow_map.find(flow_key);
	BULLSEYE_EXCLUDE_BLOCK_START
	if (rx_flow_iter == m_rx_flow_map.end()) {
		si_logdbg("Failed to find ring associated with: %s", flow_key.to_str());
		return false;
	}
	BULLSEYE_EXCLUDE_BLOCK_END
	ring* p_ring = rx_flow_iter->second;

	si_logdbg("Detaching %s from ring %p", flow_key.to_str(), p_ring);

	// Detach tuple
	unlock_rx_q();
	p_ring->detach_flow(flow_key, this);
	lock_rx_q();

	// Un-map flow from local map
#ifndef DEFINED_SOCKETXTREME // is not defined
	rx_del_ring_cb(flow_key, p_ring);
#endif // DEFINED_SOCKETXTREME

	m_rx_flow_map.erase(rx_flow_iter);

	return destroy_nd_resources((const ip_address)flow_key.get_local_if());
}
int sockinfo::ioctl(unsigned long int __request, unsigned long int __arg) throw (vma_error)
{
	int *p_arg = (int *)__arg;

	switch (__request) {
	case FIONBIO:
		{
			si_logdbg("request=FIONBIO, arg=%d", *p_arg);
			if (*p_arg)
				set_blocking(false);
			else
				set_blocking(true);
		}
		break;
	default:
		char buf[128];
		snprintf(buf, sizeof(buf), "unimplemented ioctl request=%#x, flags=%#x",
			 (unsigned)__request, (unsigned)__arg);
		buf[sizeof(buf)-1] = '\0';

		VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf);
		int rc = handle_exception_flow();
		switch (rc) {
		case -1:
			return rc;
		case -2:
			vma_throw_object_with_msg(vma_unsupported_api, buf);
		}
		break;
	}
	si_logdbg("going to OS for ioctl request=%lu, flags=%#lx", __request, __arg);
	return orig_os_api.ioctl(m_fd, __request, __arg);
}
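/*
 * Usage sketch (illustrative only, not part of sockinfo): the FIONBIO branch above is
 * reached when an application toggles non-blocking mode through the regular ioctl()
 * call that VMA intercepts. The helper name below is made up for this example.
 */
#if 0 /* example only - not compiled into the library */
#include <sys/ioctl.h>

static int example_set_nonblocking(int fd, bool enable)
{
	int on = enable ? 1 : 0;
	/* lands in sockinfo::ioctl() above, which calls set_blocking(false) when *p_arg is non-zero */
	return ioctl(fd, FIONBIO, &on);
}
#endif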
net_device_resources_t* sockinfo::create_nd_resources(const ip_address ip_local)
{
	net_device_resources_t* p_nd_resources = NULL;

	// Check if we are already registered to net_device with the local ip as observers
	rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr());
	if (rx_nd_iter == m_rx_nd_map.end()) {

		// Need to register as observer to net_device
		net_device_resources_t nd_resources;
		nd_resources.refcnt = 0;
		nd_resources.p_nde = NULL;
		nd_resources.p_ndv = NULL;
		nd_resources.p_ring = NULL;

		BULLSEYE_EXCLUDE_BLOCK_START
		cache_entry_subject<ip_address, net_device_val*>* p_ces = NULL;
		if (!g_p_net_device_table_mgr->register_observer(ip_local, &m_rx_nd_observer, &p_ces)) {
			si_logdbg("Failed registering as observer for local ip %s", ip_local.to_str().c_str());
			goto err;
		}
		nd_resources.p_nde = (net_device_entry*)p_ces;
		if (!nd_resources.p_nde) {
			si_logerr("Got NULL net_device_entry for local ip %s", ip_local.to_str().c_str());
			goto err;
		}
		if (!nd_resources.p_nde->get_val(nd_resources.p_ndv)) {
			si_logerr("Got net_device_val=NULL (interface is not offloaded) for local ip %s", ip_local.to_str().c_str());
			goto err;
		}

		unlock_rx_q();
		m_rx_ring_map_lock.lock();
		resource_allocation_key *key;
		if (m_rx_ring_map.size() && m_ring_alloc_logic.is_logic_support_migration()) {
			key = m_ring_alloc_logic.get_key();
		} else {
			key = m_ring_alloc_logic.create_new_key(ip_local.get_in_addr());
		}
		nd_resources.p_ring = nd_resources.p_ndv->reserve_ring(key);
		m_rx_ring_map_lock.unlock();
		lock_rx_q();
		if (!nd_resources.p_ring) {
			si_logdbg("Failed to reserve ring for allocation key %s on ip %s",
				  m_ring_alloc_logic.get_key()->to_str(), ip_local.to_str().c_str());
			goto err;
		}

		// Add new net_device to rx_map
		m_rx_nd_map[ip_local.get_in_addr()] = nd_resources;

		rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr());
		if (rx_nd_iter == m_rx_nd_map.end()) {
			si_logerr("Failed to find rx_nd_iter");
			goto err;
		}
		BULLSEYE_EXCLUDE_BLOCK_END
	}

	// The net_device resources entry exists in the map (created above or found earlier):
	// take another reference for this attach and hand it back to the caller
	p_nd_resources = &rx_nd_iter->second;
	p_nd_resources->refcnt++;

	return p_nd_resources;

err:
	return NULL;
}
void sockinfo::set_blocking(bool is_blocked)
{
	if (is_blocked) {
		si_logdbg("set socket to blocked mode");
		m_b_blocking = true;
	} else {
		si_logdbg("set socket to non-blocking mode");
		m_b_blocking = false;
	}

	// Update statistics info
	m_p_socket_stats->b_blocking = m_b_blocking;
}
int sockinfo::getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen)
{
	int ret = -1;

	switch (__level) {
	case SOL_SOCKET:
		switch (__optname) {
		case SO_VMA_USER_DATA:
			if (*__optlen == sizeof(m_fd_context)) {
				*(void **)__optval = m_fd_context;
				ret = 0;
			} else {
				errno = EINVAL;
			}
			break;
		case SO_VMA_FLOW_TAG:
			if (*__optlen >= sizeof(uint32_t)) {
				*(uint32_t*)__optval = m_flow_tag_id;
				ret = 0;
			} else {
				errno = EINVAL;
			}
			break;
		case SO_MAX_PACING_RATE:
			if (*__optlen == sizeof(struct vma_rate_limit_t)) {
				*(struct vma_rate_limit_t*)__optval = m_so_ratelimit;
				*__optlen = sizeof(struct vma_rate_limit_t);
				ret = 0;
				si_logdbg("(SO_MAX_PACING_RATE) value: %d, %d, %d",
					  (*(struct vma_rate_limit_t*)__optval).rate,
					  (*(struct vma_rate_limit_t*)__optval).max_burst_sz,
					  (*(struct vma_rate_limit_t*)__optval).typical_pkt_sz);
			} else if (*__optlen == sizeof(uint32_t)) {
				*(uint32_t*)__optval = KB_TO_BYTE(m_so_ratelimit.rate);
				*__optlen = sizeof(uint32_t);
				si_logdbg("(SO_MAX_PACING_RATE) value: %d", *(int *)__optval);
				ret = 0;
			} else {
				errno = EINVAL;
			}
			break;
		}
	}

	return ret;
}
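/*
 * Usage sketch (illustrative only, not part of sockinfo): how an application might query
 * the VMA-specific options handled by getsockopt() above. SO_VMA_USER_DATA and
 * SO_MAX_PACING_RATE are the option names handled above; the helper function and
 * variable names below are made up for this example.
 */
#if 0 /* example only - not compiled into the library */
#include <sys/socket.h>

static void example_query_vma_options(int fd)
{
	/* Opaque per-socket context pointer previously stored with SO_VMA_USER_DATA */
	void *user_context = NULL;
	socklen_t len = sizeof(user_context);
	if (getsockopt(fd, SOL_SOCKET, SO_VMA_USER_DATA, &user_context, &len) == 0) {
		/* user_context now holds m_fd_context for this socket */
	}

	/* Pacing rate: passing a uint32_t returns the rate converted via KB_TO_BYTE() above */
	uint32_t rate = 0;
	len = sizeof(rate);
	if (getsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, &rate, &len) == 0) {
		/* rate holds the configured SO_MAX_PACING_RATE value */
	}
}
#endif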
int sockinfo::fcntl(int __cmd, unsigned long int __arg)
{
	switch (__cmd) {
	case F_SETFL:
		{
			si_logdbg("cmd=F_SETFL, arg=%#lx", __arg);
			if (__arg & O_NONBLOCK)
				set_blocking(false);
			else
				set_blocking(true);
		}
		break;
	case F_GETFL:		/* Get file status flags. */
		si_logfunc("cmd=F_GETFL, arg=%#lx", __arg);
		break;
	case F_GETFD:		/* Get file descriptor flags. */
		si_logfunc("cmd=F_GETFD, arg=%#lx", __arg);
		break;
	case F_SETFD:		/* Set file descriptor flags. */
		si_logfunc("cmd=F_SETFD, arg=%#lx", __arg);
		break;
	default:
		char buf[128];
		snprintf(buf, sizeof(buf), "unimplemented fcntl cmd=%#x, arg=%#x",
			 (unsigned)__cmd, (unsigned)__arg);
		buf[sizeof(buf)-1] = '\0';

		VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf);
		int rc = handle_exception_flow();
		switch (rc) {
		case -1:
			return rc;
		case -2:
			vma_throw_object_with_msg(vma_unsupported_api, buf);
		}
		break;
	}
	si_logdbg("going to OS for fcntl cmd=%d, arg=%#lx", __cmd, __arg);
	return orig_os_api.fcntl(m_fd, __cmd, __arg);
}
bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key)
{
	// This function should be called from within mutex protected context of the sockinfo!!!

	si_logdbg("Attaching to %s", flow_key.to_str());

	// Protect against local loopback used as local_if & peer_ip
	// rdma_cm will accept it but we don't want to offload it
	if (flow_key.is_local_loopback()) {
		si_logdbg("VMA does not offload local loopback IP address");
		return false;
	}

	if (m_rx_flow_map.find(flow_key) != m_rx_flow_map.end()) {
		si_logdbg("already attached %s", flow_key.to_str());
		return false;
	}

	// Allocate resources on specific interface (create ring)
	net_device_resources_t* p_nd_resources = create_nd_resources((const ip_address)flow_key.get_local_if());
	if (NULL == p_nd_resources) {
		// any error which occurred inside create_nd_resources() was already printed. No need to reprint errors here
		return false;
	}

	// Map flow in local map
	m_rx_flow_map[flow_key] = p_nd_resources->p_ring;

#ifndef DEFINED_SOCKETXTREME // is not defined
	// Save the new CQ from ring
	rx_add_ring_cb(flow_key, p_nd_resources->p_ring);
#endif // DEFINED_SOCKETXTREME

	// Attach tuple
	BULLSEYE_EXCLUDE_BLOCK_START
	unlock_rx_q();
	if (!p_nd_resources->p_ring->attach_flow(flow_key, this)) {
		lock_rx_q();
		si_logdbg("Failed to attach %s to ring %p", flow_key.to_str(), p_nd_resources->p_ring);
		return false;
	}
	set_rx_packet_processor();
	lock_rx_q();
	BULLSEYE_EXCLUDE_BLOCK_END

	// Registered as receiver successfully
	si_logdbg("Attached %s to ring %p", flow_key.to_str(), p_nd_resources->p_ring);

	// Verify 5 tuple over 3 tuple
	if (flow_key.is_5_tuple()) {
		// Check and remove lesser 3 tuple
		flow_tuple_with_local_if flow_key_3t(flow_key.get_dst_ip(), flow_key.get_dst_port(),
						     INADDR_ANY, INPORT_ANY, flow_key.get_protocol(), flow_key.get_local_if());
		rx_flow_map_t::iterator rx_flow_iter = m_rx_flow_map.find(flow_key_3t);
		if (rx_flow_iter != m_rx_flow_map.end()) {
			si_logdbg("Removing (and detaching) 3 tuple now that we added a stronger 5 tuple");
			detach_receiver(flow_key_3t);
		}
	}

	return true;
}
int sockinfo::setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen)
{
	int ret = SOCKOPT_PASS_TO_OS;

	if (__level == SOL_SOCKET) {
		switch (__optname) {
		case SO_VMA_USER_DATA:
			if (__optlen == sizeof(m_fd_context)) {
				m_fd_context = *(void **)__optval;
				ret = SOCKOPT_INTERNAL_VMA_SUPPORT;
			} else {
				ret = SOCKOPT_NO_VMA_SUPPORT;
				errno = EINVAL;
			}
			break;
		case SO_VMA_RING_USER_MEMORY:
			if (__optval) {
				if (__optlen == sizeof(iovec)) {
					iovec *attr = (iovec *)__optval;
					m_ring_alloc_log_rx.set_memory_descriptor(*attr);
					m_ring_alloc_logic = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this);
					if (m_p_rx_ring || m_rx_ring_map.size()) {
						si_logwarn("user asked to assign memory for "
							   "RX ring but ring already exists");
					}
					ret = SOCKOPT_INTERNAL_VMA_SUPPORT;
				} else {
					ret = SOCKOPT_NO_VMA_SUPPORT;
					errno = EINVAL;
					si_logdbg("SOL_SOCKET, SO_VMA_RING_USER_MEMORY - "
						  "bad length expected %zu got %d",
						  sizeof(iovec), __optlen);
				}
			} else {
				ret = SOCKOPT_NO_VMA_SUPPORT;
				errno = EINVAL;
				si_logdbg("SOL_SOCKET, SO_VMA_RING_USER_MEMORY - NOT HANDLED, optval == NULL");
			}
			break;
		case SO_VMA_FLOW_TAG:
			if (__optval) {
				if (__optlen == sizeof(uint32_t)) {
					if (set_flow_tag(*(uint32_t*)__optval)) {
						si_logdbg("SO_VMA_FLOW_TAG, set "
							  "socket %d to flow id %d",
							  m_fd, m_flow_tag_id);
						// not supported in OS
						ret = SOCKOPT_INTERNAL_VMA_SUPPORT;
					} else {
						ret = SOCKOPT_NO_VMA_SUPPORT;
						errno = EINVAL;
					}
				} else {
					ret = SOCKOPT_NO_VMA_SUPPORT;
					errno = EINVAL;
					si_logdbg("SO_VMA_FLOW_TAG, bad length "
						  "expected %zu got %d",
						  sizeof(uint32_t), __optlen);
					break;
				}
			} else {
				ret = SOCKOPT_NO_VMA_SUPPORT;
				errno = EINVAL;
				si_logdbg("SO_VMA_FLOW_TAG - NOT HANDLED, "
					  "optval == NULL");
			}
			break;
		case SO_TIMESTAMP:
		case SO_TIMESTAMPNS:
			if (__optval) {
				m_b_rcvtstamp = *(bool*)__optval;
				if (__optname == SO_TIMESTAMPNS)
					m_b_rcvtstampns = m_b_rcvtstamp;
				si_logdbg("SOL_SOCKET, %s=%s", setsockopt_so_opt_to_str(__optname),
					  (m_b_rcvtstamp ? "true" : "false"));
			} else {
				si_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL",
					  setsockopt_so_opt_to_str(__optname));
			}
			break;
		case SO_TIMESTAMPING:
			if (__optval) {
				uint8_t val = *(uint8_t*)__optval;

				// SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_HARDWARE are NOT supported.
				if (val & (SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_TX_HARDWARE)) {
					ret = SOCKOPT_NO_VMA_SUPPORT;
					errno = EOPNOTSUPP;
					si_logdbg("SOL_SOCKET, SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_HARDWARE are not supported, errno set to EOPNOTSUPP");
				}

				if (val & (SOF_TIMESTAMPING_RAW_HARDWARE | SOF_TIMESTAMPING_RX_HARDWARE)) {
					if (g_p_ib_ctx_handler_collection->get_ctx_time_conversion_mode() == TS_CONVERSION_MODE_DISABLE) {
						if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_DISABLE) {
							ret = SOCKOPT_NO_VMA_SUPPORT;
							errno = EPERM;
							si_logdbg("SOL_SOCKET, SOF_TIMESTAMPING_RAW_HARDWARE and SOF_TIMESTAMPING_RX_HARDWARE socket options were disabled (VMA_HW_TS_CONVERSION = %d), errno set to EPERM", TS_CONVERSION_MODE_DISABLE);
						} else {
							ret = SOCKOPT_NO_VMA_SUPPORT;
							errno = ENODEV;
							si_logdbg("SOL_SOCKET, SOF_TIMESTAMPING_RAW_HARDWARE and SOF_TIMESTAMPING_RX_HARDWARE are not supported by device(s), errno set to ENODEV");
						}
					}
				}

				m_n_tsing_flags = val;
				si_logdbg("SOL_SOCKET, SO_TIMESTAMPING=%u", m_n_tsing_flags);
			} else {
				si_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL",
					  setsockopt_so_opt_to_str(__optname));
			}
			break;
		default:
			break;
		}
	} else if (__level == IPPROTO_IP) {
		switch (__optname) {
		case IP_TTL:
			if (__optlen < sizeof(m_n_uc_ttl)) {
				ret = SOCKOPT_NO_VMA_SUPPORT;
				errno = EINVAL;
			} else {
				int val = __optlen < sizeof(val) ? (uint8_t) *(uint8_t *)__optval : (int) *(int *)__optval;
				if (val != -1 && (val < 1 || val > 255)) {
					ret = SOCKOPT_NO_VMA_SUPPORT;
					errno = EINVAL;
				} else {
					m_n_uc_ttl = (val == -1) ? safe_mce_sys().sysctl_reader.get_net_ipv4_ttl() : (uint8_t) val;
					header_ttl_updater du(m_n_uc_ttl, false);
					update_header_field(&du);
					si_logdbg("IPPROTO_IP, optname=IP_TTL (%d)", m_n_uc_ttl);
				}
			}
			break;
		default:
			break;
		}
	}

	si_logdbg("ret (%d)", ret);
	return ret;
}
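/*
 * Usage sketch (illustrative only, not part of sockinfo): requesting RX timestamps through
 * the SO_TIMESTAMPING branch above. The SOF_TIMESTAMPING_* flags are the standard Linux
 * definitions from <linux/net_tstamp.h>; TX flags would be rejected with EOPNOTSUPP by the
 * code above, so only RX/raw flags are requested. The helper name is made up for this example.
 */
#if 0 /* example only - not compiled into the library */
#include <sys/socket.h>
#include <linux/net_tstamp.h>

static int example_enable_rx_timestamps(int fd)
{
	/* The handler above reads only the first byte of the option value,
	 * so the requested flags are kept within 8 bits. */
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE |
		    SOF_TIMESTAMPING_RX_HARDWARE |
		    SOF_TIMESTAMPING_RAW_HARDWARE;
	return setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
}
#endif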
bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key)
{
	// This function should be called from within mutex protected context of the sockinfo!!!

	si_logdbg("Attaching to %s", flow_key.to_str());

	// Protect against local loopback used as local_if & peer_ip
	// rdma_cm will accept it but we don't want to offload it
	if (flow_key.is_local_loopback()) {
		si_logdbg("VMA does not offload local loopback IP address");
		return false;
	}

	if (m_rx_flow_map.find(flow_key) != m_rx_flow_map.end()) {
		si_logdbg("already attached %s", flow_key.to_str());
		return false;
	}

	net_device_resources_t* p_nd_resources = NULL;

	// Check if we are already registered to net_device with the local ip as observers
	ip_address ip_local(flow_key.get_local_if());
	rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr());
	if (rx_nd_iter == m_rx_nd_map.end()) {

		// Need to register as observer to net_device
		net_device_resources_t nd_resources;
		nd_resources.refcnt = 0;
		nd_resources.p_nde = NULL;
		nd_resources.p_ndv = NULL;
		nd_resources.p_ring = NULL;

		BULLSEYE_EXCLUDE_BLOCK_START
		cache_entry_subject<ip_address, net_device_val*>* p_ces = NULL;
		if (!g_p_net_device_table_mgr->register_observer(ip_local, &m_rx_nd_observer, &p_ces)) {
			si_logdbg("Failed registering as observer for local ip %s", ip_local.to_str().c_str());
			return false;
		}
		nd_resources.p_nde = (net_device_entry*)p_ces;
		if (!nd_resources.p_nde) {
			si_logerr("Got NULL net_device_entry for local ip %s", ip_local.to_str().c_str());
			return false;
		}
		if (!nd_resources.p_nde->get_val(nd_resources.p_ndv)) {
			si_logerr("Got net_device_val=NULL (interface is not offloaded) for local ip %s", ip_local.to_str().c_str());
			return false;
		}

		unlock_rx_q();
		m_rx_ring_map_lock.lock();
		resource_allocation_key key = 0;
		if (m_rx_ring_map.size()) {
			key = m_ring_alloc_logic.get_key();
		} else {
			key = m_ring_alloc_logic.create_new_key();
		}
		nd_resources.p_ring = nd_resources.p_ndv->reserve_ring(key);
		m_rx_ring_map_lock.unlock();
		lock_rx_q();
		if (!nd_resources.p_ring) {
			si_logdbg("Failed to reserve ring for allocation key %d on lip %s",
				  m_ring_alloc_logic.get_key(), ip_local.to_str().c_str());
			return false;
		}

		// Add new net_device to rx_map
		m_rx_nd_map[ip_local.get_in_addr()] = nd_resources;

		rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr());
		if (rx_nd_iter == m_rx_nd_map.end()) {
			si_logerr("Failed to find rx_nd_iter");
			return false;
		}
		BULLSEYE_EXCLUDE_BLOCK_END
	}