/*
 * vr_nl_uvhost_connect - connect to the user space vhost server on a UNIX
 * domain socket.
 *
 * Returns 0 on success, error otherwise.
 */
static int
vr_nl_uvhost_connect(void)
{
    int s = 0, ret = -1, err;
    struct sockaddr_un nl_sun, uvh_sun;

    s = socket(AF_UNIX, SOCK_SEQPACKET, 0);
    if (s == -1) {
        RTE_LOG(ERR, VROUTER, "    error creating uvhost socket: %s (%d)\n",
                        rte_strerror(errno), errno);
        goto error;
    }
    RTE_LOG(INFO, VROUTER, "    uvhost Unix socket FD is %d\n", s);

    memset(&nl_sun, 0, sizeof(nl_sun));
    nl_sun.sun_family = AF_UNIX;
    strncpy(nl_sun.sun_path, vr_socket_dir, sizeof(nl_sun.sun_path) - 1);
    strncat(nl_sun.sun_path, "/"VR_NL_UVH_SOCK_NAME, sizeof(nl_sun.sun_path)
        - strlen(nl_sun.sun_path) - 1);

    mkdir(vr_socket_dir, VR_DEF_SOCKET_DIR_MODE);
    unlink(nl_sun.sun_path);
    ret = bind(s, (struct sockaddr *) &nl_sun, sizeof(nl_sun));
    if (ret == -1) {
        RTE_LOG(ERR, VROUTER, "    error binding uvhost FD %d to %s: %s (%d)\n",
                        s, nl_sun.sun_path, rte_strerror(errno), errno);
        goto error;
    }

    /*
     * This will block until the user space vhost thread listens on the
     * socket.
     */
    memset(&uvh_sun, 0, sizeof(uvh_sun));
    uvh_sun.sun_family = AF_UNIX;
    strncpy(uvh_sun.sun_path, vr_socket_dir, sizeof(uvh_sun.sun_path) - 1);
    strncat(uvh_sun.sun_path, "/"VR_UVH_NL_SOCK_NAME, sizeof(uvh_sun.sun_path)
        - strlen(uvh_sun.sun_path) - 1);

    ret = vr_dpdk_retry_connect(s, (struct sockaddr *) &uvh_sun, sizeof(uvh_sun));
    if (ret == -1) {
        RTE_LOG(ERR, VROUTER, "    error connecting uvhost socket FD %d to %s:"
            " %s (%d)\n", s, uvh_sun.sun_path, rte_strerror(errno), errno);
        goto error;
    }

    vr_nl_uvh_sock = s;

    return 0;

error:

    err = errno;
    if (s >= 0) {
        close(s);
    }
    errno = err;

    return ret;
}
Example #2
/* Get generic traffic manager operations structure from a port. */
const struct rte_tm_ops *
rte_tm_ops_get(uint16_t port_id, struct rte_tm_error *error)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	const struct rte_tm_ops *ops;

	if (!rte_eth_dev_is_valid_port(port_id)) {
		rte_tm_error_set(error,
			ENODEV,
			RTE_TM_ERROR_TYPE_UNSPECIFIED,
			NULL,
			rte_strerror(ENODEV));
		return NULL;
	}

	if ((dev->dev_ops->tm_ops_get == NULL) ||
		(dev->dev_ops->tm_ops_get(dev, &ops) != 0) ||
		(ops == NULL)) {
		rte_tm_error_set(error,
			ENOSYS,
			RTE_TM_ERROR_TYPE_UNSPECIFIED,
			NULL,
			rte_strerror(ENOSYS));
		return NULL;
	}

	return ops;
}
Example #3
/* Setup ethdev hardware queues */
static int
dpdk_ethdev_queues_setup(struct vr_dpdk_ethdev *ethdev)
{
    int ret, i;
    uint8_t port_id = ethdev->ethdev_port_id;
    struct rte_mempool *mempool;

    /* configure RX queues */
    RTE_LOG(DEBUG, VROUTER, "%s: nb_rx_queues=%u nb_tx_queues=%u\n",
        __func__, (unsigned)ethdev->ethdev_nb_rx_queues,
            (unsigned)ethdev->ethdev_nb_tx_queues);

    for (i = 0; i < VR_DPDK_MAX_NB_RX_QUEUES; i++) {
        if (i < ethdev->ethdev_nb_rss_queues) {
            mempool = vr_dpdk.rss_mempool;
            ethdev->ethdev_queue_states[i] = VR_DPDK_QUEUE_RSS_STATE;
        } else if (i < ethdev->ethdev_nb_rx_queues) {
            if (vr_dpdk.nb_free_mempools == 0) {
                RTE_LOG(ERR, VROUTER, "    error assigning mempool to eth device %"
                    PRIu8 " RX queue %d\n", port_id, i);
                return -ENOMEM;
            }
            vr_dpdk.nb_free_mempools--;
            mempool = vr_dpdk.free_mempools[vr_dpdk.nb_free_mempools];
            ethdev->ethdev_queue_states[i] = VR_DPDK_QUEUE_READY_STATE;
        } else {
            ethdev->ethdev_queue_states[i] = VR_DPDK_QUEUE_NONE;
            continue;
        }

        ret = rte_eth_rx_queue_setup(port_id, i, VR_DPDK_NB_RXD,
            rte_eth_dev_socket_id(port_id), &rx_queue_conf, mempool);
        if (ret < 0) {
            /* return mempool to the list */
            if (mempool != vr_dpdk.rss_mempool)
                vr_dpdk.nb_free_mempools++;
            RTE_LOG(ERR, VROUTER, "    error setting up eth device %" PRIu8 " RX queue %d"
                    ": %s (%d)\n", port_id, i, rte_strerror(-ret), -ret);
            return ret;
        }
        /* save queue mempool pointer */
        ethdev->ethdev_mempools[i] = mempool;
    }
    i = ethdev->ethdev_nb_rx_queues - ethdev->ethdev_nb_rss_queues;
    RTE_LOG(INFO, VROUTER, "    setup %d RSS queue(s) and %d filtering queue(s)\n",
        (int)ethdev->ethdev_nb_rss_queues, i);

    /* configure TX queues */
    for (i = 0; i < ethdev->ethdev_nb_tx_queues; i++) {
        ret = rte_eth_tx_queue_setup(port_id, i, VR_DPDK_NB_TXD,
            rte_eth_dev_socket_id(port_id), &tx_queue_conf);
        if (ret < 0) {
            RTE_LOG(ERR, VROUTER, "    error setting up eth device %" PRIu8 " TX queue %d"
                    ": %s (%d)\n", port_id, i, rte_strerror(-ret), -ret);
            return ret;
        }
    }
    return 0;
}
Example #4
/*****************************************************************************
 * trace_init()
 ****************************************************************************/
bool trace_init(void)
{
    int      error;
    uint32_t i;

    /*
     * Add tracing module CLI commands
     */
    if (!trace_cli_init()) {
        RTE_LOG(ERR, USER1, "ERROR: Can't add tracing specific CLI commands!\n");
        return false;
    }

    /*
     * Register the handlers for our message types.
     */
    error = msg_register_handler(MSG_TRACE_ENABLE, trace_handle_enable);
    if (error) {
        RTE_LOG(ERR, USER1, "Failed to register Trace msg handler: %s(%d)\n",
                rte_strerror(-error), -error);
        return false;
    }

    error = msg_register_handler(MSG_TRACE_DISABLE,
                                 trace_handle_disable);
    if (error) {
        RTE_LOG(ERR, USER1, "Failed to register Trace msg handler: %s(%d)\n",
                rte_strerror(-error), -error);
        return false;
    }

    error = msg_register_handler(MSG_TRACE_XCHG_PTR,
                                 trace_handle_xchg_ptr);
    if (error) {
        RTE_LOG(ERR, USER1, "Failed to register Trace msg handler: %s(%d)\n",
                rte_strerror(-error), -error);
        return false;
    }

    /* Initialize the defined trace components. */
    for (i = 0; i < TRACE_MAX; i++) {
        error = trace_init_component(i);
        if (error) {
            RTE_LOG(ERR, USER1, "Failed to init Trace component: %s(%d)\n",
                    rte_strerror(-error), -error);
            return false;
        }
    }

    return true;
}
Example #5
int
rte_vhost_get_numa_node(int vid)
{
#ifdef RTE_LIBRTE_VHOST_NUMA
	struct virtio_net *dev = get_device(vid);
	int numa_node;
	int ret;

	if (dev == NULL)
		return -1;

	ret = get_mempolicy(&numa_node, NULL, 0, dev,
			    MPOL_F_NODE | MPOL_F_ADDR);
	if (ret < 0) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%d) failed to query numa node: %s\n",
			vid, rte_strerror(errno));
		return -1;
	}

	return numa_node;
#else
	RTE_SET_USED(vid);
	return -1;
#endif
}
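/*
 * A minimal usage sketch (not from the original file): the vhost-user
 * new_device() callback can use the NUMA hint above to pick a per-socket
 * mempool. dev_pool[] and pool_for_node() are hypothetical helpers.
 */
static struct rte_mempool *dev_pool[1024];          /* hypothetical table */
static struct rte_mempool *pool_for_node(int node); /* hypothetical helper */

static int
new_device_example(int vid)
{
	int node = rte_vhost_get_numa_node(vid);

	if (node < 0)
		node = rte_socket_id(); /* no NUMA info: fall back to our socket */

	dev_pool[vid] = pool_for_node(node);
	return 0;
}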
Example #6
static struct rte_mempool *
pktgen_mbuf_pool_create(const char * type, uint8_t pid, uint8_t queue_id,
		uint32_t nb_mbufs, int socket_id, int cache_size )
{
    struct rte_mempool * mp;
    char    name[RTE_MEMZONE_NAMESIZE];

    snprintf(name, sizeof(name), "%-12s%u:%u", type, pid, queue_id);
    pktgen_log_info("    Create: %-*s - Memory used (MBUFs %4u x (size %u + Hdr %lu)) + %lu = %6lu KB",
            16, name, nb_mbufs, MBUF_SIZE, sizeof(struct rte_mbuf), sizeof(struct rte_mempool),
            (((nb_mbufs * (MBUF_SIZE + sizeof(struct rte_mbuf)) + sizeof(struct rte_mempool))) + 1023)/1024);
    pktgen.mem_used += ((nb_mbufs * (MBUF_SIZE + sizeof(struct rte_mbuf)) + sizeof(struct rte_mempool)));
    pktgen.total_mem_used += ((nb_mbufs * (MBUF_SIZE + sizeof(struct rte_mbuf)) + sizeof(struct rte_mempool)));

    /* create the mbuf pool */
    mp = rte_mempool_create(name, nb_mbufs, MBUF_SIZE, cache_size,
                   sizeof(struct rte_pktmbuf_pool_private),
                   rte_pktmbuf_pool_init, NULL,
                   rte_pktmbuf_init, NULL,
                   socket_id, MEMPOOL_F_DMA);
    if (mp == NULL)
        pktgen_log_panic("Cannot create mbuf pool (%s) port %d, queue %d, nb_mbufs %d, socket_id %d: %s",
        		name, pid, queue_id, nb_mbufs, socket_id, rte_strerror(errno));

    return mp;
}
static void
dpdk_ethdev_reta_show(uint8_t port_id, uint16_t reta_size)
{
    int nb_entries = reta_size/RTE_RETA_GROUP_SIZE;
    struct rte_eth_rss_reta_entry64 reta_entries[nb_entries];
    struct rte_eth_rss_reta_entry64 *reta;
    uint16_t i, idx, shift;
    int ret, entry;

    for (entry = 0; entry < nb_entries; entry++) {
        reta = &reta_entries[entry];

        /* reset RSS redirection table */
        memset(reta, 0, sizeof(*reta));
        reta->mask = 0xffffffffffffffffULL;
    }

    ret = rte_eth_dev_rss_reta_query(port_id, reta_entries, reta_size);
    if (ret != 0) {
        RTE_LOG(ERR, VROUTER, "Error getting RSS RETA info: %s (%d)\n",
            rte_strerror(-ret), -ret);
        return;
    }

    for (i = 0; i < reta_size; i++) {
        idx = i / RTE_RETA_GROUP_SIZE;
        shift = i % RTE_RETA_GROUP_SIZE;
        if (!(reta_entries[idx].mask & (1ULL << shift)))
            continue;
        RTE_LOG(DEBUG, VROUTER, "        hash index=%u, queue=%u\n",
                    i, reta_entries[idx].reta[shift]);
    }
}
/*
 * vr_netlink_uvhost_vif_add - sends a message to the user space vhost
 * thread when a new vif is created. The name of the vif is specified in
 * the vif_name argument.
 *
 * Returns 0 on success, -1 otherwise.
 */
int
vr_netlink_uvhost_vif_add(char *vif_name, unsigned int vif_idx,
                          unsigned int vif_gen, unsigned int vif_nrxqs,
                          unsigned int vif_ntxqs,
                          unsigned char vif_vhostuser_mode)
{
    vrnu_msg_t msg;

    memset(&msg, 0, sizeof(msg));
    msg.vrnum_type = VRNU_MSG_VIF_ADD;
    strncpy(msg.vrnum_vif_add.vrnu_vif_name, vif_name,
            sizeof(msg.vrnum_vif_add.vrnu_vif_name) - 1);
    msg.vrnum_vif_add.vrnu_vif_idx = vif_idx;
    msg.vrnum_vif_add.vrnu_vif_nrxqs = vif_nrxqs;
    msg.vrnum_vif_add.vrnu_vif_ntxqs = vif_ntxqs;
    msg.vrnum_vif_add.vrnu_vif_gen = vif_gen;
    msg.vrnum_vif_add.vrnu_vif_vhostuser_mode = vif_vhostuser_mode;

    /*
     * This is a blocking send.
     */
    if (send(vr_nl_uvh_sock, (void *) &msg, sizeof(msg), 0) !=
             sizeof(msg)) {
        RTE_LOG(ERR, VROUTER, "    error adding vif %u to user space vhost:"
            " %s (%d)\n", vif_idx, rte_strerror(errno), errno);
        return -1;
    }

    return 0;
}
/* Init NetLink and UVHost sockets */
int
vr_dpdk_netlink_init(void)
{
    void *event_sock = NULL;
    int ret;

    RTE_LOG(INFO, VROUTER, "Starting NetLink...\n");
    ret = vr_message_transport_register(&dpdk_nl_transport);
    if (ret)
        return ret;

#ifdef AGENT_VROUTER_TCP
    vr_dpdk.netlink_sock = vr_usocket(NETLINK, TCP);
#else
    vr_dpdk.netlink_sock = vr_usocket(NETLINK, UNIX);
#endif
    if (!vr_dpdk.netlink_sock) {
        RTE_LOG(ERR, VROUTER, "    error creating NetLink server socket:"
            " %s (%d)\n", rte_strerror(errno), errno);
        goto error;
    }
    RTE_LOG(INFO, VROUTER, "    NetLink TCP socket FD is %d\n",
            ((struct vr_usocket *)vr_dpdk.netlink_sock)->usock_fd);

    ret = vr_nl_uvhost_connect();
    if (ret != 0) {
        RTE_LOG(ERR, VROUTER, "    error creating uvhost connection\n");
        goto error;
    }

    /* create and bind event usock to wake up the NetLink lcore */
    event_sock = (void *)vr_usocket(EVENT, RAW);
    if (!event_sock) {
        RTE_LOG(ERR, VROUTER, "    error creating NetLink event\n");
        goto error;
    }

    if (vr_usocket_bind_usockets(vr_dpdk.netlink_sock,
                event_sock)) {
        RTE_LOG(ERR, VROUTER, "    error binding NetLink event\n");
        goto error;
    }
    vr_dpdk.netlink_event_sock = event_sock;

    return 0;

error:
    vr_message_transport_unregister(&dpdk_nl_transport);
    vr_usocket_close(vr_dpdk.netlink_sock);

    return -1;
}
Example #10
void
setup_shared_variables(void)
{
    const struct rte_memzone *qw_memzone;

    qw_memzone = rte_memzone_reserve(QUOTA_WATERMARK_MEMZONE_NAME, 2 * sizeof(int),
                                     rte_socket_id(), RTE_MEMZONE_2MB);
    if (qw_memzone == NULL)
        rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));

    quota = qw_memzone->addr;
    low_watermark = (unsigned int *) qw_memzone->addr + 1; /* second int */
}
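/*
 * Sketch of the consumer side (an assumption based on the memzone name:
 * a secondary process, like the quota_watermark control app, can attach
 * to the same shared variables by looking the memzone up by name).
 */
static void
attach_shared_variables(void)
{
    const struct rte_memzone *qw_memzone;

    qw_memzone = rte_memzone_lookup(QUOTA_WATERMARK_MEMZONE_NAME);
    if (qw_memzone == NULL)
        rte_exit(EXIT_FAILURE, "Couldn't find %s memzone\n",
                 QUOTA_WATERMARK_MEMZONE_NAME);

    quota = qw_memzone->addr;
    low_watermark = (unsigned int *) qw_memzone->addr + 1;
}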
/* Init RSS */
int
vr_dpdk_ethdev_rss_init(struct vr_dpdk_ethdev *ethdev)
{
    int ret, i, j, entry;
    uint8_t port_id = ethdev->ethdev_port_id;
    int nb_entries = ethdev->ethdev_reta_size/RTE_RETA_GROUP_SIZE;
    struct rte_eth_rss_reta_entry64 reta_entries[VR_DPDK_MAX_RETA_ENTRIES];
    struct rte_eth_rss_reta_entry64 *reta;

    /* There is nothing to configure if the device does not support RETA.
     * If the device reported only a few RX queues earlier, we assume those
     * queues are preconfigured for RSS by default.
     */
    if (ethdev->ethdev_reta_size == 0)
        return 0;

    RTE_LOG(DEBUG, VROUTER, "%s: RSS RETA BEFORE:\n", __func__);
    dpdk_ethdev_reta_show(port_id, ethdev->ethdev_reta_size);

    for (entry = 0; entry < nb_entries; entry++) {
        reta = &reta_entries[entry];

        /* create new RSS redirection table */
        memset(reta, 0, sizeof(*reta));
        reta->mask = 0xffffffffffffffffULL;
        for (i = j = 0; i < RTE_RETA_GROUP_SIZE; i++) {
            reta->reta[i] = j++;
            if (ethdev->ethdev_queue_states[j] != VR_DPDK_QUEUE_RSS_STATE)
                j = 0;
        }
    }
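    /*
     * Worked example (illustrative, not in the original file): with
     * ethdev_nb_rss_queues == 4 and queues 0-3 in VR_DPDK_QUEUE_RSS_STATE,
     * the loop above fills every 64-entry group as 0, 1, 2, 3, 0, 1, 2, 3,
     * ..., so hash indices are spread round-robin across the RSS queues.
     */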

    /* update RSS redirection table */
    ret = rte_eth_dev_rss_reta_update(port_id, reta_entries,
                ethdev->ethdev_reta_size);

    /* no error if the device does not support RETA configuration */
    if (ret == -ENOTSUP)
        return 0;

    if (ret < 0) {
        RTE_LOG(ERR, VROUTER, "    error initializing ethdev %" PRIu8 " RSS: %s (%d)\n",
            port_id, rte_strerror(-ret), -ret);
    }

    RTE_LOG(DEBUG, VROUTER, "%s: RSS RETA AFTER:\n", __func__);
    dpdk_ethdev_reta_show(port_id, ethdev->ethdev_reta_size);

    return ret;
}
Example #12
void init_ring(int lcore_id, uint8_t port_id)
{
    struct rte_ring *ring;
    char ring_name[RTE_RING_NAMESIZE];

    rte_snprintf(ring_name, RTE_RING_NAMESIZE,
    		"core%d_port%d", lcore_id, port_id);
    ring = rte_ring_create(ring_name, RING_SIZE, rte_socket_id(),
                           RING_F_SP_ENQ | RING_F_SC_DEQ);

    if (ring == NULL)
        rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));

    rte_ring_set_water_mark(ring, 80 * RING_SIZE / 100);

    rings[lcore_id][port_id] = ring;
}
Example #13
struct rte_mempool* init_mem(uint32_t nb_mbuf, uint32_t socket, uint32_t mbuf_size) {
	static volatile uint32_t mbuf_cnt = 0;
	char pool_name[32];
	sprintf(pool_name, "mbuf_pool%d", __sync_fetch_and_add(&mbuf_cnt, 1));
	// rte_mempool_create is apparently not thread-safe :(
	static rte_spinlock_t lock = RTE_SPINLOCK_INITIALIZER;
	rte_spinlock_lock(&lock);
	struct rte_mempool* pool = rte_pktmbuf_pool_create(pool_name, nb_mbuf, MEMPOOL_CACHE_SIZE,
		0, mbuf_size + RTE_PKTMBUF_HEADROOM,
		socket
	);
	rte_spinlock_unlock(&lock);
	if (!pool) {
		printf("Memory allocation failed: %s (%d)\n", rte_strerror(-rte_errno), rte_errno); 
		return 0;
	}
	return pool;
}
Example #14
struct rte_mempool* init_mem(uint32_t nb_mbuf, int32_t socket) {
	static volatile uint32_t mbuf_cnt = 0;
	char pool_name[32];
	sprintf(pool_name, "mbuf_pool%d", __sync_fetch_and_add(&mbuf_cnt, 1));
	// rte_mempool_create is apparently not thread-safe :(
	static rte_spinlock_t lock = RTE_SPINLOCK_INITIALIZER;
	rte_spinlock_lock(&lock);
	struct rte_mempool* pool = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE, MEMPOOL_CACHE_SIZE,
		sizeof(struct rte_pktmbuf_pool_private),
		rte_pktmbuf_pool_init, NULL,
		rte_pktmbuf_init, NULL,
		socket < 0 ? rte_socket_id() : (uint32_t) socket, 0
	);
	rte_spinlock_unlock(&lock);
	if (!pool) {
		printf("Memory allocation failed: %s (%d)\n", rte_strerror(rte_errno), rte_errno); 
		return 0;
	}
	return pool;
}
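/*
 * Minimal usage sketch (assumptions: 8192 mbufs on the caller's socket;
 * a real caller would keep the pool around instead of test-allocating).
 */
static void
init_mem_usage_example(void)
{
	struct rte_mempool *pool = init_mem(8192, (int32_t) rte_socket_id());
	struct rte_mbuf *m = pool ? rte_pktmbuf_alloc(pool) : NULL;

	if (m)
		rte_pktmbuf_free(m);
}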
Example #15
/*****************************************************************************
 * start_cores()
 ****************************************************************************/
static void start_cores(void)
{
    uint32_t core;

    /*
     * Fire up the packet processing cores
     */
    RTE_LCORE_FOREACH_SLAVE(core) {
        int index = rte_lcore_index(core);

        switch (index) {
        case TPG_CORE_IDX_CLI:
            assert(false);
        break;
        case TPG_CORE_IDX_TEST_MGMT:
            rte_eal_remote_launch(test_mgmt_loop, NULL, core);
        break;
        default:
            assert(index >= TPG_NR_OF_NON_PACKET_PROCESSING_CORES);
            rte_eal_remote_launch(pkt_receive_loop, NULL, core);
        }
    }

    /*
     * Wait for packet cores to finish initialization.
     */
    RTE_LCORE_FOREACH_SLAVE(core) {
        int   error;
        msg_t msg;

        if (!cfg_is_pkt_core(core))
            continue;

        msg_init(&msg, MSG_PKTLOOP_INIT_WAIT, core, 0);
        /* BLOCK waiting for msg to be processed */
        error = msg_send(&msg, 0);
        if (error)
            TPG_ERROR_ABORT("ERROR: Failed to send pktloop init wait msg: %s(%d)!\n",
                            rte_strerror(-error), -error);
    }
}
/*
 * vr_netlink_uvhost_vif_del - sends a message to the user space vhost
 * thread when a vif is deleted. vif_idx is the index of the vif.
 *
 * Returns 0 on success, -1 otherwise.
 */
int
vr_netlink_uvhost_vif_del(unsigned int vif_idx)
{
    vrnu_msg_t msg;

    memset(&msg, 0, sizeof(msg));
    msg.vrnum_type = VRNU_MSG_VIF_DEL;
    msg.vrnum_vif_del.vrnu_vif_idx = vif_idx;

    /*
     * This is a blocking send.
     */
    if (send(vr_nl_uvh_sock, (void *) &msg, sizeof(msg), 0) !=
             sizeof(msg)) {
        RTE_LOG(ERR, VROUTER, "    error deleting vif %u from user space vhost:"
            " %s (%d)\n", vif_idx, rte_strerror(errno), errno);
        return -1;
    }

    return 0;
}
/* Init ethernet device */
int
vr_dpdk_ethdev_init(struct vr_dpdk_ethdev *ethdev)
{
    uint8_t port_id;
    int ret;

    port_id = ethdev->ethdev_port_id;
    ethdev->ethdev_ptr = &rte_eth_devices[port_id];

    dpdk_ethdev_info_update(ethdev);

    ret = rte_eth_dev_configure(port_id, ethdev->ethdev_nb_rx_queues,
        ethdev->ethdev_nb_tx_queues, &ethdev_conf);
    if (ret < 0) {
        RTE_LOG(ERR, VROUTER, "    error configuring eth dev %" PRIu8
                ": %s (%d)\n",
            port_id, rte_strerror(-ret), -ret);
        return ret;
    }

    /* update device bond information after the device has been configured */
    if (ethdev->ethdev_ptr->driver) /* af_packet has no driver and no bond info */
        dpdk_ethdev_bond_info_update(ethdev);

    ret = dpdk_ethdev_queues_setup(ethdev);
    if (ret < 0)
        return ret;

    /* Promisc mode
     * KNI generates random MACs for e1000e NICs, so we need this
     * option enabled for the development on servers with those NICs
     */
#if VR_DPDK_ENABLE_PROMISC
    rte_eth_promiscuous_enable(port_id);
#endif

    return 0;
}
/* Init hardware filtering */
int
vr_dpdk_ethdev_filtering_init(struct vr_interface *vif,
        struct vr_dpdk_ethdev *ethdev)
{
    int ret;
    uint8_t port_id = ethdev->ethdev_port_id;
    struct rte_fdir_masks masks;
    struct rte_eth_fdir fdir_info;

    /* probe Flow Director */
    memset(&fdir_info, 0, sizeof(fdir_info));
    ret = rte_eth_dev_fdir_get_infos(port_id, &fdir_info);
    if (ret == 0) {
        /* enable hardware filtering */
        RTE_LOG(INFO, VROUTER, "    enable hardware filtering for ethdev %"
            PRIu8 "\n", port_id);
        vif->vif_flags |= VIF_FLAG_FILTERING_OFFLOAD;
    } else {
        vif->vif_flags &= ~VIF_FLAG_FILTERING_OFFLOAD;
        /* free filtering mempools */
        dpdk_ethdev_mempools_free(ethdev);
        /* the ethdev does not support hardware filtering - it's not an error */
        return 0;
    }

    memset(&masks, 0, sizeof(masks));
    masks.dst_ipv4_mask = 0xffffffff;
    masks.dst_port_mask = 0xffff;
    masks.flexbytes = 1;

    ret = rte_eth_dev_fdir_set_masks(port_id, &masks);
    if (ret < 0) {
        RTE_LOG(ERR, VROUTER, "    error setting ethdev %" PRIu8
            " Flow Director masks: %s (%d)\n", port_id, rte_strerror(-ret), -ret);
    }

    return ret;
}
static void
vr_dpdk_packet_ring_drain(struct vr_usocket *usockp)
{
    int i;
    unsigned nb_pkts;
    struct rte_mbuf *mbuf_arr[VR_DPDK_RX_BURST_SZ];
    const unsigned lcore_id = rte_lcore_id();
    struct vr_interface_stats *stats;

    RTE_LOG(DEBUG, USOCK, "%s[%lx]: draining packet ring...\n", __func__,
            pthread_self());

    if (unlikely(usockp->usock_parent->usock_vif == NULL))
        return;

    rcu_thread_offline();

    stats = vif_get_stats(usockp->usock_parent->usock_vif, lcore_id);
    do {
        nb_pkts = rte_ring_sc_dequeue_burst(vr_dpdk.packet_ring,
            (void **)&mbuf_arr, VR_DPDK_RX_BURST_SZ);
        for (i = 0; i < nb_pkts; i++) {
            if (usock_mbuf_write(usockp->usock_parent, mbuf_arr[i]) >= 0)
                stats->vis_port_opackets++;
            else {
                stats->vis_port_oerrors++;
                RTE_LOG(DEBUG, USOCK,
                        "%s: Error writing mbuf to packet socket: %s (%d)\n",
                        __func__, rte_strerror(errno), errno);
            }

            rte_pktmbuf_free(mbuf_arr[i]);
        }
    } while (nb_pkts > 0);

    rcu_thread_online();
}
void *                
intel_pool_init(const char *name)
{
  struct rte_mempool *pktmbuf_pool = NULL;
  
  pktmbuf_pool =
    rte_mempool_create(name,
                       NB_MBUF,
                       MBUF_SIZE,
                       MBUF_CACHE_SIZE,
                       sizeof(struct rte_pktmbuf_pool_private),
                       rte_pktmbuf_pool_init,
                       NULL,
                       rte_pktmbuf_init,
                       NULL,
                       DEVICE_SOCKET,
                       0);
  
  if(pktmbuf_pool == NULL) {
    rte_panic("Cannot init mbuf pool on socket 0! rte_errno:%i(%s)\n\n", rte_errno, rte_strerror(rte_errno));
  }

  return pktmbuf_pool;
}
Example #21
/* Get number of leaf nodes */
int
rte_tm_get_number_of_leaf_nodes(uint16_t port_id,
	uint32_t *n_leaf_nodes,
	struct rte_tm_error *error)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	const struct rte_tm_ops *ops =
		rte_tm_ops_get(port_id, error);

	if (ops == NULL)
		return -rte_errno;

	if (n_leaf_nodes == NULL) {
		rte_tm_error_set(error,
			EINVAL,
			RTE_TM_ERROR_TYPE_UNSPECIFIED,
			NULL,
			rte_strerror(EINVAL));
		return -rte_errno;
	}

	*n_leaf_nodes = dev->data->nb_tx_queues;
	return 0;
}
void send_loop(void)
{
	RTE_LOG(INFO, APP, "send_loop()\n");
	char pkt[PKT_SIZE] = {0};
	int nreceived;

	int retval = 0;
	(void) retval;
#ifdef CALC_CHECKSUM
	unsigned int kk = 0;
#endif
	srand(time(NULL));

	//Initialize packet contents
	int i;
	for(i = 0; i < PKT_SIZE; i++)
		pkt[i] = rand()%256;

#if ALLOC_METHOD == ALLOC_APP
	struct rte_mempool * packets_pool = rte_mempool_lookup("ovs_mp_1500_0_262144");
	//struct rte_mempool * packets_pool = rte_mempool_lookup("packets");

	//Create mempool
	//struct rte_mempool * packets_pool = rte_mempool_create(
	//	"packets",
	//	NUM_PKTS,
	//	MBUF_SIZE,
	//	CACHE_SIZE,					//This is the size of the mempool cache
	//	sizeof(struct rte_pktmbuf_pool_private),
	//	rte_pktmbuf_pool_init,
	//	NULL,
	//	rte_pktmbuf_init,
	//	NULL,
	//	rte_socket_id(),
	//	0 /*NO_FLAGS*/);


	if(packets_pool == NULL)
	{
		RTE_LOG(INFO, APP, "rte_errno: %s\n", rte_strerror(rte_errno));
		rte_exit(EXIT_FAILURE, "Cannot find memory pool\n");
	}

	RTE_LOG(INFO, APP, "There are %d free packets in the pool\n",
		rte_mempool_count(packets_pool));

#endif

#ifdef USE_BURST
	struct rte_mbuf * packets_array[BURST_SIZE] = {0};
	struct rte_mbuf * packets_array_rx[BURST_SIZE] = {0};
	int ntosend;
	int n;
	(void) n;

	/* prealloc packets */
	do
	{
		n = rte_mempool_get_bulk(packets_pool, (void **) packets_array, BURST_SIZE);
	} while(n != 0 && !stop);
	ntosend = BURST_SIZE;

#else
	struct rte_mbuf * mbuf;
	/* prealloc packet */
	do {
		mbuf = rte_pktmbuf_alloc(packets_pool);
	} while(mbuf == NULL);

#endif

	RTE_LOG(INFO, APP, "Starting sender loop\n");
	signal (SIGINT, crtl_c_handler);
	stop = 0;
	while(likely(!stop))
	{
		while(pause_);
#ifdef USE_BURST

	#if ALLOC_METHOD == ALLOC_OVS
		//Try to get BURST_SIZE free slots
		ntosend = rte_ring_dequeue_burst(alloc_q, (void **) packets_array, BURST_SIZE);
	#elif ALLOC_METHOD == ALLOC_APP
		//do
		//{
		//	n = rte_mempool_get_bulk(packets_pool, (void **) packets_array, BURST_SIZE);
		//} while(n != 0 && !stop);
		//ntosend = BURST_SIZE;
	#else
		#error "No implemented"
	#endif

		//Copy data to the buffers
		for(i = 0; i < ntosend; i++)
		{
			rte_memcpy(packets_array[i]->buf_addr, pkt, PKT_SIZE);
			//fill_packet(packets_array[i]->pkt.data);
			packets_array[i]->next = NULL;
			packets_array[i]->pkt_len = PKT_SIZE;
			packets_array[i]->data_len = PKT_SIZE;

		#ifdef CALC_CHECKSUM
			for(kk = 0; kk < 8; kk++)
				checksum += ((uint64_t *)packets_array[i]->buf_addr)[kk];
		#endif
		}

		//Enqueue data (try until all the allocated packets are enqueue)
		i = 0;
		while(i < ntosend && !stop)
		{
			i += rte_ring_enqueue_burst(tx_ring, (void **) &packets_array[i], ntosend - i);

			/* also dequeue some packets */
			nreceived = rte_ring_dequeue_burst(rx_ring, (void **) packets_array_rx, BURST_SIZE);
			rx += nreceived; /* update statistics */
		}

#else	// [NO] USE_BURST
	#if ALLOC_METHOD  == ALLOC_OVS //Method 1
		//Read a buffer to be used as a buffer for a packet
		retval = rte_ring_dequeue(alloc_q, (void **)&mbuf);
		if(retval != 0)
		{
		#ifdef CALC_ALLOC_STATS
			//stats.alloc_fails++;
		#endif
			continue;
		}
	#elif ALLOC_METHOD  == ALLOC_APP //Method 2
		//mbuf = rte_pktmbuf_alloc(packets_pool);
		//if(mbuf == NULL)
		//{
		//#ifdef CALC_ALLOC_STATS
		//	stats.alloc_fails++;
		//#endif
		//	continue;
		//}
	#else
		#error "ALLOC_METHOD has a non valid value"
	#endif

	#if DELAY_CYCLES > 0
		//This loop increases the number of packets per second (don't ask me why)
		unsigned long long j = 0;
		for(j = 0; j < DELAY_CYCLES; j++)
			asm("");
	#endif

		//Copy packet to the correct buffer
		rte_memcpy(mbuf->buf_addr, pkt, PKT_SIZE);
		//fill_packet(mbuf->pkt.data);
		//mbuf->pkt.next = NULL;
		//mbuf->pkt.pkt_len = PKT_SIZE;
		//mbuf->pkt.data_len = PKT_SIZE;
		(void) pkt;
		mbuf->next = NULL;
		mbuf->pkt_len = PKT_SIZE;
		mbuf->data_len = PKT_SIZE;

	#ifdef CALC_CHECKSUM
		for(kk = 0; kk < 8; kk++)
			checksum += ((uint64_t *)mbuf->buf_addr)[kk];
	#endif

		//This method avoids dropping packets:
		//simply retry until the packet is inserted in the queue
		tryagain:
		retval = rte_ring_enqueue(tx_ring, (void *) mbuf);
		if(retval == -ENOBUFS && !stop)
		{
	#ifdef CALC_TX_TRIES
			//stats.tx_retries++;
	#endif
			goto tryagain;
		}

	#ifdef CALC_TX_STATS
		//stats.tx++;
	#endif

#endif //USE_BURST
	}

#ifdef CALC_CHECKSUM
	printf("Checksum was %" PRIu64 "\n", checksum);
#endif

}
Example #23
void pktgen_config_ports(void)
{
    uint32_t lid, pid, i, s, q, sid;
    rxtx_t	rt;
    pkt_seq_t   * pkt;
    port_info_t     * info;
    char buff[RTE_MEMZONE_NAMESIZE];
    int32_t ret, cache_size;
	char output_buff[256] = { 0 };

    // Find out the total number of ports in the system.
    // We have already blacklisted the ones we needed to in the main routine.
    pktgen.nb_ports = rte_eth_dev_count();
    if (pktgen.nb_ports > RTE_MAX_ETHPORTS)
        pktgen.nb_ports = RTE_MAX_ETHPORTS;

    if ( pktgen.nb_ports == 0 )
		pktgen_log_panic("*** Did not find any ports to use ***");

    pktgen.starting_port = 0;

    // Setup the number of ports to display at a time
	if ( pktgen.nb_ports > pktgen.nb_ports_per_page )
		pktgen.ending_port = pktgen.starting_port + pktgen.nb_ports_per_page;
	else
		pktgen.ending_port = pktgen.starting_port + pktgen.nb_ports;

    wr_port_matrix_dump(pktgen.l2p);

    pktgen_log_info("Configuring %d ports, MBUF Size %d, MBUF Cache Size %d",
    		pktgen.nb_ports, MBUF_SIZE, MBUF_CACHE_SIZE);

    // For each lcore setup each port that is handled by that lcore.
    for(lid = 0; lid < RTE_MAX_LCORE; lid++) {

        if ( wr_get_map(pktgen.l2p, RTE_MAX_ETHPORTS, lid) == 0 )
            continue;

		// For each port attached or handled by the lcore
        for(pid = 0; pid < pktgen.nb_ports; pid++) {

        	// If non-zero then this port is handled by this lcore.
            if ( wr_get_map(pktgen.l2p, pid, lid) == 0 )
                continue;
        	wr_set_port_private(pktgen.l2p, pid, &pktgen.info[pid]);
        	pktgen.info[pid].pid = pid;
        }
    }
    wr_dump_l2p(pktgen.l2p);

    pktgen.total_mem_used = 0;

    for(pid = 0; pid < pktgen.nb_ports; pid++) {
    	// Skip if we do not have any lcores attached to a port.
    	if ( (rt.rxtx = wr_get_map(pktgen.l2p, pid, RTE_MAX_LCORE)) == 0 )
            continue;

		pktgen.port_cnt++;
		snprintf(output_buff, sizeof(output_buff),
				"Initialize Port %d -- TxQ %d, RxQ %d", pid, rt.tx, rt.rx);

        info = wr_get_port_private(pktgen.l2p, pid);

		// Create the pkt header structures for transmitting sequence of packets.
		snprintf(buff, sizeof(buff), "seq_hdr_%d", pid);
		info->seq_pkt = (pkt_seq_t *)rte_zmalloc(buff, (sizeof(pkt_seq_t) * NUM_TOTAL_PKTS), RTE_CACHE_LINE_SIZE);
		if ( info->seq_pkt == NULL )
			pktgen_log_panic("Unable to allocate %d pkt_seq_t headers", NUM_TOTAL_PKTS);

		info->seqIdx    = 0;
		info->seqCnt    = 0;

		info->nb_mbufs  = MAX_MBUFS_PER_PORT;
		cache_size = (info->nb_mbufs > RTE_MEMPOOL_CACHE_MAX_SIZE)?
							RTE_MEMPOOL_CACHE_MAX_SIZE : info->nb_mbufs;

		pktgen_port_conf_setup(pid, &rt, &default_port_conf);

		if ( (ret = rte_eth_dev_configure(pid, rt.rx, rt.tx, &info->port_conf)) < 0)
			pktgen_log_panic("Cannot configure device: port=%d, Num queues %d,%d (%d)%s",
					pid, rt.rx, rt.tx, errno, rte_strerror(-ret));

		pkt = &info->seq_pkt[SINGLE_PKT];

		// Grab the source MAC address.
		rte_eth_macaddr_get(pid, &pkt->eth_src_addr);
		pktgen_log_info("%s,  Src MAC %02x:%02x:%02x:%02x:%02x:%02x", output_buff,
				pkt->eth_src_addr.addr_bytes[0],
				pkt->eth_src_addr.addr_bytes[1],
				pkt->eth_src_addr.addr_bytes[2],
				pkt->eth_src_addr.addr_bytes[3],
				pkt->eth_src_addr.addr_bytes[4],
				pkt->eth_src_addr.addr_bytes[5]);

		// Copy the first Src MAC address in SINGLE_PKT to the rest of the sequence packets.
		for (i = 0; i < NUM_SEQ_PKTS; i++)
			ethAddrCopy( &info->seq_pkt[i].eth_src_addr, &pkt->eth_src_addr );

		pktgen.mem_used = 0;

		for(q = 0; q < rt.rx; q++) {
			// grab the socket id value based on the lcore being used.
			sid		= rte_lcore_to_socket_id(wr_get_port_lid(pktgen.l2p, pid, q));

			// Create and initialize the default Receive buffers.
			info->q[q].rx_mp = pktgen_mbuf_pool_create("Default RX", pid, q, info->nb_mbufs, sid, cache_size);
			if ( info->q[q].rx_mp == NULL )
				pktgen_log_panic("Cannot init port %d for Default RX mbufs", pid);

			ret = rte_eth_rx_queue_setup(pid, q, pktgen.nb_rxd, sid, &info->rx_conf, pktgen.info[pid].q[q].rx_mp);
			if (ret < 0)
				pktgen_log_panic("rte_eth_rx_queue_setup: err=%d, port=%d, %s", ret, pid, rte_strerror(-ret));
		}
		pktgen_log_info("");

		for(q = 0; q < rt.tx; q++) {
			// grab the socket id value based on the lcore being used.
			sid		= rte_lcore_to_socket_id(wr_get_port_lid(pktgen.l2p, pid, q));

			// Create and initialize the default Transmit buffers.
			info->q[q].tx_mp = pktgen_mbuf_pool_create("Default TX", pid, q, MAX_MBUFS_PER_PORT, sid, cache_size);
			if ( info->q[q].tx_mp == NULL )
				pktgen_log_panic("Cannot init port %d for Default TX mbufs", pid);

			// Create and initialize the range Transmit buffers.
			info->q[q].range_mp = pktgen_mbuf_pool_create("Range TX", pid, q, MAX_MBUFS_PER_PORT,	sid, 0);
			if ( info->q[q].range_mp == NULL )
				pktgen_log_panic("Cannot init port %d for Range TX mbufs", pid);

			// Create and initialize the sequence Transmit buffers.
			info->q[q].seq_mp = pktgen_mbuf_pool_create("Sequence TX", pid, q, MAX_MBUFS_PER_PORT, sid, cache_size);
			if ( info->q[q].seq_mp == NULL )
				pktgen_log_panic("Cannot init port %d for Sequence TX mbufs", pid);

			// Used for sending special packets like ARP requests
			info->q[q].special_mp = pktgen_mbuf_pool_create("Special TX", pid, q, MAX_SPECIAL_MBUFS, sid, cache_size);
			if (info->q[q].special_mp == NULL)
				pktgen_log_panic("Cannot init port %d for Special TX mbufs", pid);

			// Setup the PCAP file for each port
			if ( pktgen.info[pid].pcap != NULL ) {
				if ( pktgen_pcap_parse(pktgen.info[pid].pcap, info, q) == -1 )
					pktgen_log_panic("Cannot load PCAP file for port %d", pid);
			}
			// Find out the link speed to program the WTHRESH value correctly.
			pktgen_get_link_status(info, pid, 0);

			//info->tx_conf.tx_thresh.wthresh = (info->link.link_speed == 1000)? TX_WTHRESH_1GB : TX_WTHRESH;

			ret = rte_eth_tx_queue_setup(pid, q, pktgen.nb_txd, sid, &info->tx_conf);
			if (ret < 0)
				pktgen_log_panic("rte_eth_tx_queue_setup: err=%d, port=%d, %s", ret, pid, rte_strerror(-ret));
#if 0
			ret = rte_eth_dev_flow_ctrl_set(pid, &fc_conf);
			if (ret < 0)
				pktgen_log_panic("rte_eth_dev_flow_ctrl_set: err=%d, port=%d, %s", ret, pid, rte_strerror(-ret));
#endif
			pktgen_log_info("");
		}
		pktgen_log_info("%*sPort memory used = %6lu KB", 71, " ", (pktgen.mem_used + 1023)/1024);
	}
    pktgen_log_info("%*sTotal memory used = %6lu KB", 70, " ", (pktgen.total_mem_used + 1023)/1024);

    // Start up the ports and display the port Link status
    for(pid = 0; pid < pktgen.nb_ports; pid++) {
        if ( wr_get_map(pktgen.l2p, pid, RTE_MAX_LCORE) == 0 )
            continue;

        info = wr_get_port_private(pktgen.l2p, pid);

        /* Start device */
        if ( (ret = rte_eth_dev_start(pid)) < 0 )
            pktgen_log_panic("rte_eth_dev_start: port=%d, %s", pid, rte_strerror(-ret));

        pktgen_get_link_status(info, pid, 1);

        if (info->link.link_status) {
            snprintf(output_buff, sizeof(output_buff), "Port %2d: Link Up - speed %u Mbps - %s", pid,
                   (uint32_t) info->link.link_speed,
                   (info->link.link_duplex == ETH_LINK_FULL_DUPLEX) ?
                   ("full-duplex") : ("half-duplex"));
        } else
            snprintf(output_buff, sizeof(output_buff), "Port %2d: Link Down", pid);


        // If enabled, put device in promiscuous mode.
        if (pktgen.flags & PROMISCUOUS_ON_FLAG) {
			strncatf(output_buff, " <Enable promiscuous mode>");
            rte_eth_promiscuous_enable(pid);
        }

		pktgen_log_info("%s", output_buff);
    	pktgen.info[pid].seq_pkt[SINGLE_PKT].pktSize = MIN_PKT_SIZE;

        // Setup the port and packet defaults. (must be after link speed is found)
        for (s = 0; s < NUM_TOTAL_PKTS; s++)
            pktgen_port_defaults(pid, s);

        pktgen_range_setup(info);

		pktgen_rnd_bits_init(&pktgen.info[pid].rnd_bitfields);
    }
	pktgen_log_info("");

	for (sid = 0; sid < wr_coremap_cnt(pktgen.core_info, pktgen.core_cnt, 0); sid++)
		pktgen_packet_capture_init(&pktgen.capture[sid], sid);
}
Example #24
static int
test_errno(void)
{
	const char *rte_retval;
	const char *libc_retval;
#ifdef RTE_EXEC_ENV_BSDAPP
	/* BSD has a colon in the string, unlike linux */
	const char unknown_code_result[] = "Unknown error: %d";
#else
	const char unknown_code_result[] = "Unknown error %d";
#endif
	char expected_libc_retval[sizeof(unknown_code_result)+3];

	/* use a small selection of standard errors for testing */
	int std_errs[] = {EAGAIN, EBADF, EACCES, EINTR, EINVAL};
	/* test ALL registered RTE error codes for overlap */
	int rte_errs[] = {E_RTE_SECONDARY, E_RTE_NO_CONFIG, E_RTE_NO_TAILQ};
	unsigned i;

	rte_errno = 0;
	if (rte_errno != 0)
		return -1;
	/* check for standard errors we return the same as libc */
	for (i = 0; i < sizeof(std_errs)/sizeof(std_errs[0]); i++){
		rte_retval = rte_strerror(std_errs[i]);
		libc_retval = strerror(std_errs[i]);
		printf("rte_strerror: '%s', strerror: '%s'\n",
				rte_retval, libc_retval);
		if (strcmp(rte_retval, libc_retval) != 0)
			return -1;
	}
	/* for rte-specific errors ensure we return a different string
	 * and that the string for libc is for an unknown error
	 */
	for (i = 0; i < sizeof(rte_errs)/sizeof(rte_errs[0]); i++){
		rte_retval = rte_strerror(rte_errs[i]);
		libc_retval = strerror(rte_errs[i]);
		printf("rte_strerror: '%s', strerror: '%s'\n",
				rte_retval, libc_retval);
		if (strcmp(rte_retval, libc_retval) == 0)
			return -1;
		/* generate appropriate error string for unknown error number
		 * and then check that this is what we got back. If not, we have
		 * a duplicate error number that conflicts with errno.h */
		snprintf(expected_libc_retval, sizeof(expected_libc_retval),
				unknown_code_result, rte_errs[i]);
		if ((strcmp(expected_libc_retval, libc_retval) != 0) &&
				(strcmp("", libc_retval) != 0)){
			printf("Error, duplicate error code %d\n", rte_errs[i]);
			return -1;
		}
	}

	/* ensure that beyond RTE_MAX_ERRNO, we always get an unknown code */
	rte_retval = rte_strerror(RTE_MAX_ERRNO + 1);
	libc_retval = strerror(RTE_MAX_ERRNO + 1);
	snprintf(expected_libc_retval, sizeof(expected_libc_retval),
			unknown_code_result, RTE_MAX_ERRNO + 1);
	printf("rte_strerror: '%s', strerror: '%s'\n",
			rte_retval, libc_retval);
	if ((strcmp(rte_retval, libc_retval) != 0) ||
			(strcmp(expected_libc_retval, libc_retval) != 0)){
		if (strcmp("", libc_retval) != 0){
			printf("Failed test for RTE_MAX_ERRNO + 1 value\n");
			return -1;
		}
	}

	return 0;
}
/*
 * vr_uvh_cl_msg_handler - handler for messages from user space vhost
 * clients. Calls the appropriate handler based on the message type.
 *
 * Returns 0 on success, -1 on error.
 *
 * TODO: upon error, this function currently makes the process exit.
 * Instead, it should close the socket and continue serving other clients.
 */
static int
vr_uvh_cl_msg_handler(int fd, void *arg)
{
    vr_uvh_client_t *vru_cl = (vr_uvh_client_t *) arg;
    struct msghdr mhdr;
    struct iovec iov;
    int i, err, ret = 0, read_len = 0;
    struct cmsghdr *cmsg;

    memset(&mhdr, 0, sizeof(mhdr));

    if (vru_cl->vruc_msg_bytes_read == 0) {
        mhdr.msg_control = &vru_cl->vruc_cmsg;
        mhdr.msg_controllen = sizeof(vru_cl->vruc_cmsg);

        iov.iov_base = (void *) &vru_cl->vruc_msg;
        iov.iov_len = VHOST_USER_HSIZE;

        mhdr.msg_iov = &iov;
        mhdr.msg_iovlen = 1;

        ret = recvmsg(fd, &mhdr, MSG_DONTWAIT);
        if (ret < 0) {
            if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
                ret = 0;
                goto cleanup;
            }

            vr_uvhost_log("Receive returned %d in vhost server for client %s\n",
                          ret, vru_cl->vruc_path);
            ret = -1;
            goto cleanup;
        } else if (ret > 0) {
            if (mhdr.msg_flags & MSG_CTRUNC) {
                vr_uvhost_log("Truncated control message from vhost client %s\n",
                             vru_cl->vruc_path);
                ret = -1;
                goto cleanup;
            }

            cmsg = CMSG_FIRSTHDR(&mhdr);
            if (cmsg && (cmsg->cmsg_len > 0) &&
                   (cmsg->cmsg_level == SOL_SOCKET) &&
                   (cmsg->cmsg_type == SCM_RIGHTS)) {
                   vru_cl->vruc_num_fds_sent = (cmsg->cmsg_len - CMSG_LEN(0))/
                                                   sizeof(int);
                   if (vru_cl->vruc_num_fds_sent > VHOST_MEMORY_MAX_NREGIONS) {
                        vr_uvhost_log("Too many FDs sent for client %s: %d\n",
                                vru_cl->vruc_path,  vru_cl->vruc_num_fds_sent);
                       vru_cl->vruc_num_fds_sent = VHOST_MEMORY_MAX_NREGIONS;
                   }

                   memcpy(vru_cl->vruc_fds_sent, CMSG_DATA(cmsg),
                          vru_cl->vruc_num_fds_sent*sizeof(int));
            }

            vru_cl->vruc_msg_bytes_read = ret;
            if (ret < VHOST_USER_HSIZE) {
                ret = 0;
                goto cleanup;
            }

            read_len = vru_cl->vruc_msg.size;
        } else {
            /*
             * recvmsg returned 0, so return error.
             */
            vr_uvhost_log("Receive returned %d in vhost server for client %s\n",
                          ret, vru_cl->vruc_path);
            ret = -1;
            goto cleanup;
        }
    } else if (vru_cl->vruc_msg_bytes_read < VHOST_USER_HSIZE) {
        read_len = VHOST_USER_HSIZE - vru_cl->vruc_msg_bytes_read;
    } else {
        read_len = vru_cl->vruc_msg.size -
                       (vru_cl->vruc_msg_bytes_read - VHOST_USER_HSIZE);
    }

    if (read_len) {
        if (vru_cl->vruc_owner != pthread_self()) {
            if (vru_cl->vruc_owner)
                RTE_LOG(WARNING, UVHOST, "WARNING: thread %lx is trying to read"
                    " uvhost client FD %d owned by thread %lx\n",
                    pthread_self(), fd, vru_cl->vruc_owner);
            vru_cl->vruc_owner = pthread_self();
        }
        ret = read(fd, (((char *)&vru_cl->vruc_msg) + vru_cl->vruc_msg_bytes_read),
                   read_len);
#ifdef VR_DPDK_RX_PKT_DUMP
        if (ret > 0) {
            RTE_LOG(DEBUG, UVHOST, "%s[%lx]: FD %d read %d bytes\n", __func__,
                pthread_self(), fd, ret);
            rte_hexdump(stdout, "uvhost full message dump:",
                (((char *)&vru_cl->vruc_msg)),
                    ret + vru_cl->vruc_msg_bytes_read);
        } else if (ret < 0) {
            RTE_LOG(DEBUG, UVHOST, "%s[%lx]: FD %d read returned error %d: %s (%d)\n", __func__,
                pthread_self(), fd, ret, rte_strerror(errno), errno);
        }
#endif
        if (ret < 0) {
            if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) {
                ret = 0;
                goto cleanup;
            }

            vr_uvhost_log(
                "Error: read returned %d, %d %d %d in vhost server for client %s\n",
                ret, errno, read_len,
                vru_cl->vruc_msg_bytes_read, vru_cl->vruc_path);
            ret = -1;
            goto cleanup;
        } else if (ret == 0) {
             vr_uvhost_log("Read returned %d in vhost server for client %s\n",
                           ret, vru_cl->vruc_path);
            ret = -1;
            goto cleanup;
        }

        vru_cl->vruc_msg_bytes_read += ret;
        if (vru_cl->vruc_msg_bytes_read < VHOST_USER_HSIZE) {
            ret = 0;
            goto cleanup;
        }

        if (vru_cl->vruc_msg_bytes_read <
                (vru_cl->vruc_msg.size + VHOST_USER_HSIZE)) {
            ret = 0;
            goto cleanup;
        }
    }

    ret = vr_uvh_cl_call_handler(vru_cl);
    if (ret < 0) {
        vr_uvhost_log("Error handling message %d client %s\n",
                      vru_cl->vruc_msg.request, vru_cl->vruc_path);
        ret = -1;
        goto cleanup;
    }

    ret = vr_uvh_cl_send_reply(fd, vru_cl);
    if (ret < 0) {
        vr_uvhost_log("Error sending reply for message %d client %s\n",
                      vru_cl->vruc_msg.request, vru_cl->vruc_path);
        ret = -1;
        goto cleanup;
    }

cleanup:
    err = errno;
    /* close all the FDs received */
    for (i = 0; i < vru_cl->vruc_num_fds_sent; i++) {
        if (vru_cl->vruc_fds_sent[i] > 0)
            close(vru_cl->vruc_fds_sent[i]);
    }
    if (ret == -1) {
        /* set VQ_NOT_READY state to vif's queues. */
        for (i = 0; i < VR_DPDK_VIRTIO_MAX_QUEUES; i++) {
            vr_dpdk_virtio_rxqs[vru_cl->vruc_idx][i].vdv_ready_state = VQ_NOT_READY;
            vr_dpdk_virtio_txqs[vru_cl->vruc_idx][i].vdv_ready_state = VQ_NOT_READY;
        }
        rte_wmb();
        synchronize_rcu();
        /*
         * Unmaps qemu's FDs.
         */
        vr_dpdk_virtio_uvh_vif_munmap(&vr_dpdk_virtio_uvh_vif_mmap[vru_cl->vruc_idx]);
    }
    /* clear state for next message from this client. */
    vru_cl->vruc_msg_bytes_read = 0;
    memset(&vru_cl->vruc_msg, 0, sizeof(vru_cl->vruc_msg));
    memset(vru_cl->vruc_cmsg, 0, sizeof(vru_cl->vruc_cmsg));
    memset(vru_cl->vruc_fds_sent, 0, sizeof(vru_cl->vruc_fds_sent));
    vru_cl->vruc_num_fds_sent = 0;
    errno = err;
    return ret;
}
/*
 * vr_uvh_nl_vif_add_handler - handle a vif add message from the netlink
 * thread. In response, the vhost server thread starts listening on the
 * UNIX domain socket corresponding to this vif.
 *
 * Returns 0 on success, -1 otherwise.
 */
static int
vr_uvh_nl_vif_add_handler(vrnu_vif_add_t *msg)
{
    int s = 0, ret = -1, err;
    struct sockaddr_un sun;
    int flags;
    vr_uvh_client_t *vru_cl = NULL;
    mode_t umask_mode;

    if (msg == NULL) {
        vr_uvhost_log("    error adding vif %u: message is NULL\n",
                        msg->vrnu_vif_idx);
        return -1;
    }

    vr_uvhost_log("Adding vif %d virtual device %s\n", msg->vrnu_vif_idx,
                        msg->vrnu_vif_name);
    s = socket(AF_UNIX, SOCK_STREAM, 0);
    if (s == -1) {
        vr_uvhost_log("    error creating vif %u socket: %s (%d)\n",
                        msg->vrnu_vif_idx, rte_strerror(errno), errno);
        goto error;
    }
    vr_uvhost_log("    vif %u socket %s FD is %d\n",
                            msg->vrnu_vif_idx, msg->vrnu_vif_name, s);

    memset(&sun, 0, sizeof(sun));
    strncpy(sun.sun_path, VR_UVH_VIF_PREFIX, sizeof(sun.sun_path) - 1);
    strncat(sun.sun_path, msg->vrnu_vif_name,
        sizeof(sun.sun_path) - strlen(sun.sun_path) - 1);
    sun.sun_family = AF_UNIX;

    mkdir(VR_SOCKET_DIR, VR_SOCKET_DIR_MODE);
    unlink(sun.sun_path);

    /*
     * Ensure RW permissions for the socket files such that QEMU process is
     * able to connect.
     */
    umask_mode = umask(~(S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH |
            S_IWOTH));

    ret = bind(s, (struct sockaddr *) &sun, sizeof(sun));
    if (ret == -1) {
        vr_uvhost_log("    error binding vif %u FD %d to %s: %s (%d)\n",
            msg->vrnu_vif_idx, s, sun.sun_path, rte_strerror(errno), errno);
        goto error;
    }

    umask(umask_mode);

    /*
     * Set the socket to non-blocking
     */
    flags = fcntl(s, F_GETFL, 0);
    fcntl(s, F_SETFL, flags | O_NONBLOCK);

    ret = listen(s, 1);
    if (ret == -1) {
        vr_uvhost_log("    error listening vif %u socket FD %d: %s (%d)\n",
                        msg->vrnu_vif_idx, s, rte_strerror(errno), errno);
        goto error;
    }

    vru_cl = vr_uvhost_new_client(s, sun.sun_path, msg->vrnu_vif_idx);
    if (vru_cl == NULL) {
        vr_uvhost_log("    error creating vif %u socket %s new vhost client\n",
                      msg->vrnu_vif_idx, sun.sun_path);
        goto error;
    }

    vru_cl->vruc_idx = msg->vrnu_vif_idx;
    vru_cl->vruc_nrxqs = msg->vrnu_vif_nrxqs;
    vru_cl->vruc_ntxqs = msg->vrnu_vif_ntxqs;

    ret = vr_uvhost_add_fd(s, UVH_FD_READ, vru_cl, vr_uvh_cl_listen_handler);
    if (ret == -1) {
        vr_uvhost_log("    error adding vif %u socket FD %d\n",
                        msg->vrnu_vif_idx, s);
        goto error;
    }

    vr_dpdk_virtio_set_vif_client(msg->vrnu_vif_idx, vru_cl);

    return 0;

error:

    err = errno;
    if (s >= 0) {
        close(s);
    }

    if (vru_cl) {
        vr_uvhost_del_client(vru_cl);
    }
    errno = err;

    return ret;
}
/*
 * vr_uvhm_set_mem_table - handles VHOST_USER_SET_MEM_TABLE message from
 * user space vhost client to learn the memory map of the guest.
 *
 * Returns 0 on success, -1 otherwise.
 */
static int
vr_uvhm_set_mem_table(vr_uvh_client_t *vru_cl)
{
    int i;
    int ret;
    vr_uvh_client_mem_region_t *region;
    VhostUserMemory *vum_msg;
    uint64_t size;
    vr_dpdk_uvh_vif_mmap_addr_t *const vif_mmap_addrs = (
                             &(vr_dpdk_virtio_uvh_vif_mmap[vru_cl->vruc_idx]));

    vum_msg = &vru_cl->vruc_msg.memory;
    vr_uvhost_log("Number of memory regions: %d\n", vum_msg->nregions);
    for (i = 0; i < vum_msg->nregions; i++) {
        vr_uvhost_log("Region %d: physical address 0x%" PRIx64 ", size 0x%"
                PRIx64 ", offset 0x%" PRIx64 "\n",
                i, vum_msg->regions[i].guest_phys_addr,
                vum_msg->regions[i].memory_size,
                vum_msg->regions[i].mmap_offset);

        if (vru_cl->vruc_fds_sent[i]) {
            region = &vru_cl->vruc_mem_regions[i];

            region->vrucmr_phys_addr = vum_msg->regions[i].guest_phys_addr;
            region->vrucmr_size = vum_msg->regions[i].memory_size;
            region->vrucmr_user_space_addr = vum_msg->regions[i].userspace_addr;

            size = vum_msg->regions[i].mmap_offset +
                       vum_msg->regions[i].memory_size;
            region->vrucmr_mmap_addr = (uint64_t)
                                            mmap(0,
                                            size,
                                            PROT_READ | PROT_WRITE,
                                            MAP_SHARED,
                                            vru_cl->vruc_fds_sent[i],
                                            0);

            if (region->vrucmr_mmap_addr == ((uint64_t)MAP_FAILED)) {
                vr_uvhost_log("mmap for size 0x%" PRIx64 " failed for FD %d"
                        " on vhost client %s (%s)\n",
                        size,
                        vru_cl->vruc_fds_sent[i],
                        vru_cl->vruc_path, rte_strerror(errno));
                /* the file descriptor is no longer needed */
                close(vru_cl->vruc_fds_sent[i]);
                vru_cl->vruc_fds_sent[i] = -1;
                return -1;
            }
            /* Set values for munmap(2) function. */
            ret = vr_dpdk_virtio_uvh_get_blk_size(vru_cl->vruc_fds_sent[i],
                                 &vif_mmap_addrs->vu_mmap_data[i].unmap_blksz);
            if (ret) {
                vr_uvhost_log("Get block size failed for FD %d on vhost client %s \n",
                              vru_cl->vruc_fds_sent[i], vru_cl->vruc_path);
                return -1;
            }

            vif_mmap_addrs->vu_mmap_data[i].unmap_mmap_addr = ((uint64_t)
                                                       region->vrucmr_mmap_addr);
            vif_mmap_addrs->vu_mmap_data[i].unmap_size = size;

            /* the file descriptor is no longer needed */
            close(vru_cl->vruc_fds_sent[i]);
            vru_cl->vruc_fds_sent[i] = -1;
            region->vrucmr_mmap_addr += vum_msg->regions[i].mmap_offset;
        }
    }

    /* Save the number of regions. */
    vru_cl->vruc_num_mem_regions = vum_msg->nregions;
    vif_mmap_addrs->vu_nregions = vum_msg->nregions;

    return 0;
}
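/*
 * Sketch (an assumption, not part of the original file): once the regions
 * above are mapped, a guest physical address taken from a virtqueue
 * descriptor can be translated to a host virtual address like this.
 */
static void *
uvhm_gpa_to_vva_example(vr_uvh_client_t *vru_cl, uint64_t gpa)
{
    vr_uvh_client_mem_region_t *reg;
    unsigned int i;

    for (i = 0; i < vru_cl->vruc_num_mem_regions; i++) {
        reg = &vru_cl->vruc_mem_regions[i];
        if (gpa >= reg->vrucmr_phys_addr &&
                gpa < reg->vrucmr_phys_addr + reg->vrucmr_size)
            return (void *)(uintptr_t)(reg->vrucmr_mmap_addr +
                    (gpa - reg->vrucmr_phys_addr));
    }

    return NULL; /* address not covered by any region */
}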
static int
__usock_read(struct vr_usocket *usockp)
{
    int ret;
    unsigned int offset = usockp->usock_read_offset;
    unsigned int len = usockp->usock_read_len;
    unsigned int toread = len - offset;

    struct nlmsghdr *nlh;
    unsigned int proto = usockp->usock_proto;
    char *buf = usockp->usock_rx_buf;

    if (toread > usockp->usock_buf_len) {
        toread = usockp->usock_buf_len - offset;
    }

retry_read:
    if (usockp->usock_owner != pthread_self()) {
        if (usockp->usock_owner)
            RTE_LOG(WARNING, USOCK, "WARNING: thread %lx is trying to read"
                " usocket FD %d owned by thread %lx\n",
                pthread_self(), usockp->usock_fd, usockp->usock_owner);
        usockp->usock_owner = pthread_self();
    }
    ret = read(usockp->usock_fd, buf + offset, toread);
#ifdef VR_DPDK_USOCK_DUMP
    if (ret > 0) {
        RTE_LOG(DEBUG, USOCK, "%s[%lx]: FD %d read %d bytes\n", __func__,
            pthread_self(), usockp->usock_fd, ret);
        rte_hexdump(stdout, "usock buffer dump:", buf + offset, ret);
    } else if (ret < 0) {
        RTE_LOG(DEBUG, USOCK, "%s[%lx]: FD %d read returned error %d: %s (%d)\n", __func__,
            pthread_self(), usockp->usock_fd, ret, rte_strerror(errno), errno);
    }
#endif
    if (ret <= 0) {
        if (!ret)
            return -1;

        if (errno == EINTR)
            goto retry_read;

        if ((errno == EAGAIN) ||
                (errno == EWOULDBLOCK))
            return 0;

        RTE_LOG(ERR, USOCK, "Error reading FD %d: %s (%d)\n",
                usockp->usock_fd, rte_strerror(errno), errno);
        return ret;
    }

    offset += ret;
    usockp->usock_read_offset = offset;

    if (proto == NETLINK) {
        if (usockp->usock_state == READING_HEADER) {
            if (usockp->usock_read_offset == usockp->usock_read_len) {
                usockp->usock_state = READING_DATA;
                nlh = (struct nlmsghdr *)(usockp->usock_rx_buf);
                usockp->usock_read_len = nlh->nlmsg_len;
            }
        }

        if (usockp->usock_buf_len < usockp->usock_read_len) {
            usockp->usock_rx_buf = vr_malloc(usockp->usock_read_len,
                    VR_USOCK_BUF_OBJECT);
            if (!usockp->usock_rx_buf) {
                /* bad, but let's recover */
                usockp->usock_rx_buf = buf;
                usockp->usock_read_len -= usockp->usock_read_offset;
                usockp->usock_read_offset = 0;
                usockp->usock_state = READING_FAULTY_DATA;
            } else {
                memcpy(usockp->usock_rx_buf, buf, usockp->usock_read_offset);
                vr_free(buf, VR_USOCK_BUF_OBJECT);
                usockp->usock_buf_len = usockp->usock_read_len;
                buf = usockp->usock_rx_buf;
            }
        }
    } else if (proto == PACKET) {
        usockp->usock_read_len = ret;
    }

    return ret;
}
Example #29
int
main(int argc, char **argv)
{
	int ret;
	unsigned nb_ports;
	unsigned int lcore_id, last_lcore_id, master_lcore_id;
	uint8_t port_id;
	uint8_t nb_ports_available;
	struct worker_thread_args worker_args = {NULL, NULL};
	struct send_thread_args send_args = {NULL, NULL};
	struct rte_ring *rx_to_workers;
	struct rte_ring *workers_to_tx;

	/* catch ctrl-c so we can print on exit */
	signal(SIGINT, int_handler);

	/* Initialize EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		return -1;

	argc -= ret;
	argv += ret;

	/* Parse the application specific arguments */
	ret = parse_args(argc, argv);
	if (ret < 0)
		return -1;

	/* Check if we have enough cores */
	if (rte_lcore_count() < 3)
		rte_exit(EXIT_FAILURE, "Error, This application needs at "
				"least 3 logical cores to run:\n"
				"1 lcore for packet RX\n"
				"1 lcore for packet TX\n"
				"and at least 1 lcore for worker threads\n");

	nb_ports = rte_eth_dev_count();
	if (nb_ports == 0)
		rte_exit(EXIT_FAILURE, "Error: no ethernet ports detected\n");
	if (nb_ports != 1 && (nb_ports & 1))
		rte_exit(EXIT_FAILURE, "Error: number of ports must be even, except "
				"when using a single port\n");

	mbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", MBUF_PER_POOL,
			MBUF_POOL_CACHE_SIZE, 0, MBUF_DATA_SIZE,
			rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));

	nb_ports_available = nb_ports;

	/* initialize all ports */
	for (port_id = 0; port_id < nb_ports; port_id++) {
		/* skip ports that are not enabled */
		if ((portmask & (1 << port_id)) == 0) {
			printf("\nSkipping disabled port %d\n", port_id);
			nb_ports_available--;
			continue;
		}
		/* init port */
		printf("Initializing port %u... done\n", (unsigned) port_id);

		if (configure_eth_port(port_id) != 0)
			rte_exit(EXIT_FAILURE, "Cannot initialize port %"PRIu8"\n",
					port_id);
	}

	if (!nb_ports_available) {
		rte_exit(EXIT_FAILURE,
			"All available ports are disabled. Please set portmask.\n");
	}

	/* Create rings for inter core communication */
	rx_to_workers = rte_ring_create("rx_to_workers", RING_SIZE, rte_socket_id(),
			RING_F_SP_ENQ);
	if (rx_to_workers == NULL)
		rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));

	workers_to_tx = rte_ring_create("workers_to_tx", RING_SIZE, rte_socket_id(),
			RING_F_SC_DEQ);
	if (workers_to_tx == NULL)
		rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));

	if (!disable_reorder) {
		send_args.buffer = rte_reorder_create("PKT_RO", rte_socket_id(),
				REORDER_BUFFER_SIZE);
		if (send_args.buffer == NULL)
			rte_exit(EXIT_FAILURE, "%s\n", rte_strerror(rte_errno));
	}

	last_lcore_id   = get_last_lcore_id();
	master_lcore_id = rte_get_master_lcore();

	worker_args.ring_in  = rx_to_workers;
	worker_args.ring_out = workers_to_tx;

	/* Start worker_thread() on all available slave cores except the last one */
	for (lcore_id = 0; lcore_id <= get_previous_lcore_id(last_lcore_id); lcore_id++)
		if (rte_lcore_is_enabled(lcore_id) && lcore_id != master_lcore_id)
			rte_eal_remote_launch(worker_thread, (void *)&worker_args,
					lcore_id);

	if (disable_reorder) {
		/* Start tx_thread() on the last slave core */
		rte_eal_remote_launch((lcore_function_t *)tx_thread, workers_to_tx,
				last_lcore_id);
	} else {
		send_args.ring_in = workers_to_tx;
		/* Start send_thread() on the last slave core */
		rte_eal_remote_launch((lcore_function_t *)send_thread,
				(void *)&send_args, last_lcore_id);
	}

	/* Start rx_thread() on the master core */
	rx_thread(rx_to_workers);

	RTE_LCORE_FOREACH_SLAVE(lcore_id) {
		if (rte_eal_wait_lcore(lcore_id) < 0)
			return -1;
	}

	print_stats();
	return 0;
}
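
For context, a minimal sketch of what the send path does with the reorder buffer created above; reorder_and_tx, BURST_SIZE and TX queue 0 are illustrative, and it assumes the RX thread has already stamped each mbuf's sequence number:

#include <rte_ring.h>
#include <rte_reorder.h>
#include <rte_mbuf.h>
#include <rte_ethdev.h>

#define BURST_SIZE 32 /* hypothetical burst size */

/*
 * Drain mbufs from the workers_to_tx ring, feed them into the reorder
 * buffer, then transmit only the packets that are now in order.
 */
static void
reorder_and_tx(struct rte_ring *ring_in, struct rte_reorder_buffer *buffer,
		uint8_t port_id)
{
	struct rte_mbuf *mbufs[BURST_SIZE], *ordered[BURST_SIZE];
	unsigned i, nb_dq, nb_ordered, nb_tx;

	nb_dq = rte_ring_dequeue_burst(ring_in, (void **)mbufs,
			BURST_SIZE, NULL);

	/* insert out-of-order packets; the library buffers early ones */
	for (i = 0; i < nb_dq; i++)
		if (rte_reorder_insert(buffer, mbufs[i]) < 0)
			rte_pktmbuf_free(mbufs[i]); /* too late to reorder */

	/* drain returns only packets whose turn has come */
	nb_ordered = rte_reorder_drain(buffer, ordered, BURST_SIZE);
	if (nb_ordered > 0) {
		nb_tx = rte_eth_tx_burst(port_id, 0, ordered, nb_ordered);
		/* free anything the NIC did not accept */
		for (i = nb_tx; i < nb_ordered; i++)
			rte_pktmbuf_free(ordered[i]);
	}
}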
/* Init KNI RX queue */
struct vr_dpdk_queue *
vr_dpdk_kni_rx_queue_init(unsigned lcore_id, struct vr_interface *vif,
    unsigned host_lcore_id)
{
    struct vr_dpdk_lcore *lcore = vr_dpdk.lcores[lcore_id];
    const unsigned socket_id = rte_lcore_to_socket_id(lcore_id);
    uint8_t port_id = 0;
    unsigned vif_idx = vif->vif_idx;
    struct vr_dpdk_queue *rx_queue = &lcore->lcore_rx_queues[vif_idx];
    struct vr_dpdk_queue_params *rx_queue_params
                    = &lcore->lcore_rx_queue_params[vif_idx];

    if (vif->vif_type == VIF_TYPE_HOST) {
        port_id = (((struct vr_dpdk_ethdev *)(vif->vif_bridge->vif_os))->
                ethdev_port_id);
    }

    /* init queue */
    rx_queue->rxq_ops = dpdk_knidev_reader_ops;
    rx_queue->q_queue_h = NULL;
    rx_queue->q_vif = vrouter_get_interface(vif->vif_rid, vif_idx);

    /* create the queue */
    struct dpdk_knidev_reader_params reader_params = {
        .kni = vif->vif_os,
    };
    rx_queue->q_queue_h = rx_queue->rxq_ops.f_create(&reader_params, socket_id);
    if (rx_queue->q_queue_h == NULL) {
        RTE_LOG(ERR, VROUTER, "    error creating KNI device %s RX queue"
            " at eth device %" PRIu8 "\n", vif->vif_name, port_id);
        return NULL;
    }

    /* store queue params */
    rx_queue_params->qp_release_op = &dpdk_kni_rx_queue_release;

    return rx_queue;
}
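
The returned queue is consumed by the forwarding lcores through its rte_port-style ops. A minimal sketch of such a poll (poll_rx_queue is a hypothetical helper, not part of vRouter):

#include <rte_mbuf.h>
#include "vr_dpdk.h" /* for struct vr_dpdk_queue (assumed header) */

/* f_rx follows the DPDK rte_port reader convention and returns the
 * number of mbufs filled into pkts. */
static inline int
poll_rx_queue(struct vr_dpdk_queue *rx_queue, struct rte_mbuf **pkts,
        uint32_t max_pkts)
{
    if (rx_queue->rxq_ops.f_rx == NULL)
        return 0;
    return rx_queue->rxq_ops.f_rx(rx_queue->q_queue_h, pkts, max_pkts);
}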

/* Release KNI TX queue */
static void
dpdk_kni_tx_queue_release(unsigned lcore_id, struct vr_interface *vif)
{
    struct vr_dpdk_lcore *lcore = vr_dpdk.lcores[lcore_id];
    struct vr_dpdk_queue *tx_queue = &lcore->lcore_tx_queues[vif->vif_idx];
    struct vr_dpdk_queue_params *tx_queue_params
                        = &lcore->lcore_tx_queue_params[vif->vif_idx];

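    /*
     * Clear the TX op before freeing the queue and make the write
     * visible to other lcores, so nobody dereferences the handle
     * mid-release.
     */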
    tx_queue->txq_ops.f_tx = NULL;
    rte_wmb();

    /* flush and free the queue */
    if (tx_queue->txq_ops.f_free(tx_queue->q_queue_h)) {
        RTE_LOG(ERR, VROUTER, "    error freeing lcore %u KNI device TX queue\n",
                    lcore_id);
    }

    /* reset the queue */
    vrouter_put_interface(tx_queue->q_vif);
    memset(tx_queue, 0, sizeof(*tx_queue));
    memset(tx_queue_params, 0, sizeof(*tx_queue_params));
}

/* Init KNI TX queue */
struct vr_dpdk_queue *
vr_dpdk_kni_tx_queue_init(unsigned lcore_id, struct vr_interface *vif,
    unsigned host_lcore_id)
{
    struct vr_dpdk_lcore *lcore = vr_dpdk.lcores[lcore_id];
    const unsigned socket_id = rte_lcore_to_socket_id(lcore_id);
    uint8_t port_id = 0;
    unsigned vif_idx = vif->vif_idx;
    struct vr_dpdk_queue *tx_queue = &lcore->lcore_tx_queues[vif_idx];
    struct vr_dpdk_queue_params *tx_queue_params
                    = &lcore->lcore_tx_queue_params[vif_idx];
    struct vr_dpdk_ethdev *ethdev;

    if (vif->vif_type == VIF_TYPE_HOST) {
        ethdev = vif->vif_bridge->vif_os;
        if (ethdev == NULL) {
            RTE_LOG(ERR, VROUTER, "    error creating KNI device %s TX queue:"
                " bridge vif %u ethdev is not initialized\n",
                vif->vif_name, vif->vif_bridge->vif_idx);
            return NULL;
        }
        port_id = ethdev->ethdev_port_id;
    }

    /* init queue */
    tx_queue->txq_ops = dpdk_knidev_writer_ops;
    tx_queue->q_queue_h = NULL;
    tx_queue->q_vif = vrouter_get_interface(vif->vif_rid, vif_idx);

    /* create the queue */
    struct dpdk_knidev_writer_params writer_params = {
        .kni = vif->vif_os,
        .tx_burst_sz = VR_DPDK_TX_BURST_SZ,
    };
    tx_queue->q_queue_h = tx_queue->txq_ops.f_create(&writer_params, socket_id);
    if (tx_queue->q_queue_h == NULL) {
        RTE_LOG(ERR, VROUTER, "    error creating KNI device %s TX queue"
            " at eth device %" PRIu8 "\n", vif->vif_name, port_id);
        return NULL;
    }

    /* store queue params */
    tx_queue_params->qp_release_op = &dpdk_kni_tx_queue_release;

    return tx_queue;
}
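
On the transmit side the queue is driven the same way, through its rte_port-style writer op. A minimal sketch (kni_tx_one is a hypothetical helper, not part of vRouter):

#include <rte_mbuf.h>
#include "vr_dpdk.h" /* for struct vr_dpdk_queue (assumed header) */

/* f_tx follows the DPDK rte_port writer convention; the writer buffers
 * internally up to tx_burst_sz packets before pushing to the kernel. */
static inline int
kni_tx_one(struct vr_dpdk_queue *tx_queue, struct rte_mbuf *m)
{
    if (tx_queue->txq_ops.f_tx == NULL) {
        /* queue is being released (see dpdk_kni_tx_queue_release) */
        rte_pktmbuf_free(m);
        return -1;
    }
    return tx_queue->txq_ops.f_tx(tx_queue->q_queue_h, m);
}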

/* Change KNI MTU size callback */
static int
dpdk_knidev_change_mtu(uint8_t port_id, unsigned new_mtu)
{
    struct vrouter *router = vrouter_get(0);
    struct vr_interface *vif;
    int i, ret;
    uint8_t ethdev_port_id, slave_port_id;
    struct vr_dpdk_ethdev *ethdev = NULL;

    RTE_LOG(INFO, VROUTER, "Changing eth device %" PRIu8 " MTU to %u\n",
                    port_id, new_mtu);
    if (port_id >= rte_eth_dev_count()) {
        RTE_LOG(ERR, VROUTER, "Error changing eth device %"PRIu8" MTU: invalid eth device\n", port_id);
        return -EINVAL;
    }

    /*
     * TODO: DPDK bond PMD does not implement mtu_set op, so we need to
     * set the MTU manually for all the slaves.
     */
    /* Bond vif uses first slave port ID. */
    if (router->vr_eth_if) {
        ethdev = (struct vr_dpdk_ethdev *)router->vr_eth_if->vif_os;
        if (ethdev && ethdev->ethdev_nb_slaves > 0) {
            for (i = 0; i < ethdev->ethdev_nb_slaves; i++) {
                if (port_id == ethdev->ethdev_slaves[i])
                    break;
            }
            /* Clear ethdev if no port match. */
            if (i >= ethdev->ethdev_nb_slaves)
                ethdev = NULL;
        }
    }
    if (ethdev && ethdev->ethdev_nb_slaves > 0) {
        for (i = 0; i < ethdev->ethdev_nb_slaves; i++) {
            slave_port_id = ethdev->ethdev_slaves[i];
            RTE_LOG(INFO, VROUTER, "    changing bond member eth device %" PRIu8
                " MTU to %u\n", slave_port_id, new_mtu);

            ret = rte_eth_dev_set_mtu(slave_port_id, new_mtu);
            if (ret < 0) {
                RTE_LOG(ERR, VROUTER, "    error changing bond member eth device %" PRIu8
                    " MTU: %s (%d)\n", slave_port_id, rte_strerror(-ret), -ret);
                return ret;
            }
        }
    } else {
        ret = rte_eth_dev_set_mtu(port_id, new_mtu);
        if (ret < 0) {
            RTE_LOG(ERR, VROUTER, "Error changing eth device %" PRIu8
                " MTU: %s (%d)\n", port_id, rte_strerror(-ret), -ret);
        }
        return ret;
    }

    /* On success, inform vrouter about the new MTU */
    for (i = 0; i < router->vr_max_interfaces; i++) {
        vif = __vrouter_get_interface(router, i);
        if (vif && (vif->vif_type == VIF_TYPE_PHYSICAL)) {
            ethdev_port_id = (((struct vr_dpdk_ethdev *)(vif->vif_os))->
                        ethdev_port_id);
            if (ethdev_port_id == port_id) {
                /*
                 * Account for the Ethernet header (e.g. MTU 1500 ->
                 * vif_mtu 1514) without accumulating into new_mtu, in
                 * case more than one vif matches the port.
                 */
                vif->vif_mtu = new_mtu + sizeof(struct vr_eth);
                if (vr_dpdk.vlan_tag != VLAN_ID_INVALID) {
                    /* 802.1q tag adds another 4 bytes */
                    vif->vif_mtu += sizeof(uint32_t);
                }
                if (vif->vif_bridge)
                    vif->vif_bridge->vif_mtu = vif->vif_mtu;
            }
        }
    }
        }
    }

    return 0;
}


/* Configure KNI state callback */
static int
dpdk_knidev_config_network_if(uint8_t port_id, uint8_t if_up)
{
    int ret = 0;

    RTE_LOG(INFO, VROUTER, "Configuring eth device %" PRIu8 " %s\n",
                    port_id, if_up ? "UP" : "DOWN");
    if (port_id >= rte_eth_dev_count() || port_id >= RTE_MAX_ETHPORTS) {
        RTE_LOG(ERR, VROUTER, "Invalid eth device %" PRIu8 "\n", port_id);
        return -EINVAL;
    }

    if (if_up)
        ret = rte_eth_dev_start(port_id);
    else
        rte_eth_dev_stop(port_id);

    if (ret < 0) {
        RTE_LOG(ERR, VROUTER, "Configuring eth device %" PRIu8 " UP"
                    "failed (%d)\n", port_id, ret);
    }

    return ret;
}

/* Init KNI */
int
vr_dpdk_knidev_init(uint8_t port_id, struct vr_interface *vif)
{
    int i;
    struct rte_eth_dev_info dev_info;
    struct rte_kni_conf kni_conf;
    struct rte_kni_ops kni_ops;
    struct rte_kni *kni;
    struct rte_config *rte_conf = rte_eal_get_configuration();

    if (!vr_dpdk.kni_inited) {
        /*
         * If the host does not support KNI (e.g. some RedHat kernels),
         * we will panic here.
         */
        rte_kni_init(VR_DPDK_MAX_KNI_INTERFACES);
        vr_dpdk.kni_inited = true;
    }

    /* get eth device info */
    memset(&dev_info, 0, sizeof(dev_info));
    rte_eth_dev_info_get(port_id, &dev_info);

    /* create KNI configuration */
    memset(&kni_conf, 0, sizeof(kni_conf));
    strncpy(kni_conf.name, (char *)vif->vif_name, sizeof(kni_conf.name) - 1);

    kni_conf.addr = dev_info.pci_dev->addr;
    kni_conf.id = dev_info.pci_dev->id;
    kni_conf.group_id = port_id;
    kni_conf.mbuf_size = VR_DPDK_MAX_PACKET_SZ;
    /*
     * Due to DPDK commit 41a6ebd, now to prevent packet reordering in KNI
     * we have to bind KNI kernel thread to a first online unused CPU.
     */
    for (i = 0; i < RTE_MAX_LCORE; i++) {
        if (lcore_config[i].detected
                && rte_conf->lcore_role[VR_DPDK_FWD_LCORE_ID + i] == ROLE_OFF) {
            kni_conf.force_bind = 1;
            kni_conf.core_id = i;
            RTE_LOG(INFO, VROUTER, "    bind KNI kernel thread to CPU %d\n", i);
            break;
        }
    }

    /* KNI options
     *
     * Changing state of the KNI interface can change state of the physical
     * interface. This is useful for the vhost, but not for the VLAN
     * forwarding interface.
     */
    if (vif->vif_type == VIF_TYPE_VLAN) {
        memset(&kni_ops, 0, sizeof(kni_ops));
    } else {
        kni_ops.port_id = port_id;
        kni_ops.change_mtu = dpdk_knidev_change_mtu;
        kni_ops.config_network_if = dpdk_knidev_config_network_if;
    }

    /* allocate KNI device */
    kni = rte_kni_alloc(vr_dpdk.rss_mempool, &kni_conf, &kni_ops);
    if (kni == NULL) {
        RTE_LOG(ERR, VROUTER, "    error allocation KNI device %s"
            " at eth device %" PRIu8 "\n", vif->vif_name, port_id);
        return -ENOMEM;
    }

    /* store pointer to KNI for further use */
    vif->vif_os = kni;

    /* add interface to the table of KNIs */
    for (i = 0; i < VR_DPDK_MAX_KNI_INTERFACES; i++) {
        if (vr_dpdk.knis[i] == NULL) {
            vr_dpdk.knis[i] = vif->vif_os;
            break;
        }
    }

    return 0;
}
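
Once the KNI device exists, some lcore has to service it periodically, or the callbacks registered above never fire. A minimal sketch of one loop iteration (kni_service_once and KNI_BURST are illustrative names):

#include <rte_kni.h>
#include <rte_mbuf.h>

#define KNI_BURST 32 /* hypothetical burst size */

/*
 * Handle pending control requests (this is what eventually invokes
 * dpdk_knidev_change_mtu() and dpdk_knidev_config_network_if() above)
 * and drain packets the kernel stack has written to the interface.
 */
static void
kni_service_once(struct rte_kni *kni)
{
    struct rte_mbuf *mbufs[KNI_BURST];
    unsigned nb, i;

    /* dispatch MTU/link-state requests queued by the kernel */
    rte_kni_handle_request(kni);

    /* packets from the kernel side, to be forwarded in a real loop */
    nb = rte_kni_rx_burst(kni, mbufs, KNI_BURST);
    for (i = 0; i < nb; i++)
        rte_pktmbuf_free(mbufs[i]); /* a real loop would forward these */
}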