/* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
 * Netlink socket created with the given 'protocol', and initializes 'dump' to
 * reflect the state of the operation.
 *
 * nlmsg_len in 'msg' will be finalized to match msg->size, and nlmsg_pid will
 * be set to the Netlink socket's pid, before the message is sent.  NLM_F_DUMP
 * and NLM_F_ACK will be set in nlmsg_flags.
 *
 * The design of this Netlink socket library ensures that the dump is reliable.
 *
 * This function provides no status indication.  An error status for the entire
 * dump operation is provided when it is completed by calling nl_dump_done().
 *
 * The caller is responsible for destroying 'request'.
 */
void
nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
{
    ofpbuf_init(&dump->buffer, 4096);
    dump->status = nl_pool_alloc(protocol, &dump->sock);
    if (dump->status) {
        return;
    }

    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
    dump->status = nl_sock_send__(dump->sock, request,
                                  nl_sock_allocate_seq(dump->sock, 1), true);
    dump->seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
}
/* Helper function for nl_dump_next(). */
static int
nl_dump_recv(struct nl_dump *dump)
{
    struct nlmsghdr *nlmsghdr;
    int retval;

    retval = nl_sock_recv__(dump->sock, &dump->buffer, true);
    if (retval) {
        return retval == EINTR ? EAGAIN : retval;
    }

    nlmsghdr = nl_msg_nlmsghdr(&dump->buffer);
    if (dump->seq != nlmsghdr->nlmsg_seq) {
        VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
                    nlmsghdr->nlmsg_seq, dump->seq);
        return EAGAIN;
    }

    if (nl_msg_nlmsgerr(&dump->buffer, &retval)) {
        VLOG_INFO_RL(&rl, "netlink dump request error (%s)",
                     ovs_strerror(retval));
        return retval && retval != EAGAIN ? retval : EPROTO;
    }

    return 0;
}
/* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
 * Netlink socket created with the given 'protocol', and initializes 'dump' to
 * reflect the state of the operation.
 *
 * 'request' must contain a Netlink message.  Before sending the message,
 * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
 * set to the Netlink socket's pid.  NLM_F_DUMP and NLM_F_ACK will be set in
 * nlmsg_flags.
 *
 * The design of this Netlink socket library ensures that the dump is reliable.
 *
 * This function provides no status indication.  nl_dump_done() provides an
 * error status for the entire dump operation.
 *
 * The caller must eventually destroy 'request'.
 */
void
nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
{
    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;

    ovs_mutex_init(&dump->mutex);
    ovs_mutex_lock(&dump->mutex);
    dump->status = nl_pool_alloc(protocol, &dump->sock);
    if (!dump->status) {
        dump->status = nl_sock_send__(dump->sock, request,
                                      nl_sock_allocate_seq(dump->sock, 1),
                                      true);
    }
    dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
    ovs_mutex_unlock(&dump->mutex);
}
static int
nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
               uint32_t nlmsg_seq, bool wait)
{
    struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
    int error;

    nlmsg->nlmsg_len = ofpbuf_size(msg);
    nlmsg->nlmsg_seq = nlmsg_seq;
    nlmsg->nlmsg_pid = sock->pid;
    do {
        int retval;
#ifdef _WIN32
        bool result;
        DWORD last_error = 0;
        result = WriteFile(sock->handle, ofpbuf_data(msg), ofpbuf_size(msg),
                           &retval, NULL);
        last_error = GetLastError();
        if (last_error != ERROR_SUCCESS && !result) {
            retval = -1;
            errno = EAGAIN;
        }
#else
        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
#endif
        error = retval < 0 ? errno : 0;
    } while (error == EINTR);
    log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
    if (!error) {
        COVERAGE_INC(netlink_sent);
    }
    return error;
}
/* Starts a Netlink "dump" operation, by sending 'request' to the kernel via
 * 'sock', and initializes 'dump' to reflect the state of the operation.
 *
 * nlmsg_len in 'msg' will be finalized to match msg->size, and nlmsg_pid will
 * be set to 'sock''s pid, before the message is sent.  NLM_F_DUMP and
 * NLM_F_ACK will be set in nlmsg_flags.
 *
 * This Netlink socket library is designed to ensure that the dump is reliable
 * and that it will not interfere with other operations on 'sock', including
 * destroying or sending and receiving messages on 'sock'.  One corner case is
 * not handled:
 *
 *   - If 'sock' has been used to send a request (e.g. with nl_sock_send())
 *     whose response has not yet been received (e.g. with nl_sock_recv()).
 *     This is unusual: usually nl_sock_transact() is used to send a message
 *     and receive its reply all in one go.
 *
 * This function provides no status indication.  An error status for the entire
 * dump operation is provided when it is completed by calling nl_dump_done().
 *
 * The caller is responsible for destroying 'request'.
 *
 * The new 'dump' is independent of 'sock'.  'sock' and 'dump' may be destroyed
 * in either order.
 */
void
nl_dump_start(struct nl_dump *dump,
              struct nl_sock *sock, const struct ofpbuf *request)
{
    ofpbuf_init(&dump->buffer, 4096);
    if (sock->dump) {
        /* 'sock' already has an ongoing dump.  Clone the socket because
         * Netlink only allows one dump at a time. */
        dump->status = nl_sock_clone(sock, &dump->sock);
        if (dump->status) {
            return;
        }
    } else {
        sock->dump = dump;
        dump->sock = sock;
        dump->status = 0;
    }

    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
    dump->status = nl_sock_send__(sock, request, nl_sock_allocate_seq(sock, 1),
                                  true);
    dump->seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
}
/* Encapsulates 'msg', which must contain an OpenFlow message, in a Netlink
 * message, and sends it to the OpenFlow local datapath numbered 'dp_idx' via
 * 'sock'.
 *
 * Returns 0 if successful, otherwise a positive errno value.  Returns EAGAIN
 * if the 'sock' send buffer is full.
 *
 * If the send is successful, then the kernel module will receive it, but there
 * is no guarantee that any reply will not be dropped (see nl_sock_transact()
 * for details). 
 */
int
dpif_send_openflow(struct dpif *dp, int dp_idx, struct ofpbuf *buffer)
{
    struct ofp_header *oh;
    unsigned int dump_flag;
    struct ofpbuf hdr;
    struct nlattr *nla;
    uint32_t fixed_buffer[64 / 4];
    struct iovec iov[3];
    int pad_bytes;
    int n_iov;
    int retval;

    /* The reply to OFPT_STATS_REQUEST may be multiple segments long, so we
     * need to specify NLM_F_DUMP in the request. */
    oh = ofpbuf_at_assert(buffer, 0, sizeof *oh);
    dump_flag = oh->type == OFPT_STATS_REQUEST ? NLM_F_DUMP : 0;

    ofpbuf_use(&hdr, fixed_buffer, sizeof fixed_buffer);
    nl_msg_put_genlmsghdr(&hdr, dp->sock, 32, openflow_family,
                          NLM_F_REQUEST | dump_flag, DP_GENL_C_OPENFLOW, 1);
    nl_msg_put_u32(&hdr, DP_GENL_A_DP_IDX, dp_idx);
    nla = ofpbuf_put_uninit(&hdr, sizeof *nla);
    nla->nla_len = sizeof *nla + buffer->size;
    nla->nla_type = DP_GENL_A_OPENFLOW;
    pad_bytes = NLA_ALIGN(nla->nla_len) - nla->nla_len;
    nl_msg_nlmsghdr(&hdr)->nlmsg_len = hdr.size + buffer->size + pad_bytes;
    n_iov = 2;
    iov[0].iov_base = hdr.data;
    iov[0].iov_len = hdr.size;
    iov[1].iov_base = buffer->data;
    iov[1].iov_len = buffer->size;
    if (pad_bytes) {
        static char zeros[NLA_ALIGNTO];
        n_iov++;
        iov[2].iov_base = zeros;
        iov[2].iov_len = pad_bytes; 
    }
    retval = nl_sock_sendv(dp->sock, iov, n_iov, false);
    if (retval && retval != EAGAIN) {
        VLOG_WARN_RL(&rl, "dpif_send_openflow: %s", strerror(retval));
    }
    return retval;
}
static int
nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
               uint32_t nlmsg_seq, bool wait)
{
    struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
    int error;

    nlmsg->nlmsg_len = msg->size;
    nlmsg->nlmsg_seq = nlmsg_seq;
    nlmsg->nlmsg_pid = sock->pid;
    do {
        int retval;
        retval = send(sock->fd, msg->data, msg->size, wait ? 0 : MSG_DONTWAIT);
        error = retval < 0 ? errno : 0;
    } while (error == EINTR);
    log_nlmsg(__func__, error, msg->data, msg->size, sock->protocol);
    if (!error) {
        COVERAGE_INC(netlink_sent);
    }
    return error;
}
/* Tries to receive an openflow message from datapath 'dp_idx' on 'sock'.  If
 * successful, stores the received message into '*msgp' and returns 0.  The
 * caller is responsible for destroying the message with ofpbuf_delete().  On
 * failure, returns a positive errno value and stores a null pointer into
 * '*msgp'.
 *
 * Only Netlink messages with embedded OpenFlow messages are accepted.  Other
 * Netlink messages provoke errors.
 *
 * If 'wait' is true, dpif_recv_openflow waits for a message to be ready;
 * otherwise, returns EAGAIN if the 'sock' receive buffer is empty. */
int
dpif_recv_openflow(struct dpif *dp, int dp_idx, struct ofpbuf **bufferp,
                   bool wait)
{
    struct nlattr *attrs[ARRAY_SIZE(openflow_policy)];
    struct ofpbuf *buffer;
    struct ofp_header *oh;
    uint16_t ofp_len;

    buffer = *bufferp = NULL;
    do {
        int retval;

        do {
            ofpbuf_delete(buffer);
            retval = nl_sock_recv(dp->sock, &buffer, wait);
        } while (retval == ENOBUFS
                 || (!retval
                     && (nl_msg_nlmsghdr(buffer)->nlmsg_type == NLMSG_DONE
                         || nl_msg_nlmsgerr(buffer, NULL))));
        if (retval) {
            if (retval != EAGAIN) {
                VLOG_WARN_RL(&rl, "dpif_recv_openflow: %s", strerror(retval));
            }
            return retval;
        }

        if (nl_msg_genlmsghdr(buffer) == NULL) {
            VLOG_DBG_RL(&rl, "received packet too short for Generic Netlink");
            goto error;
        }
        if (nl_msg_nlmsghdr(buffer)->nlmsg_type != openflow_family) {
            VLOG_DBG_RL(&rl,
                        "received type (%"PRIu16") != openflow family (%d)",
                        nl_msg_nlmsghdr(buffer)->nlmsg_type, openflow_family);
            goto error;
        }

        if (!nl_policy_parse(buffer, NLMSG_HDRLEN + GENL_HDRLEN,
                             openflow_policy, attrs,
                             ARRAY_SIZE(openflow_policy))) {
            goto error;
        }
    } while (nl_attr_get_u32(attrs[DP_GENL_A_DP_IDX]) != dp_idx);

    oh = buffer->data = (void *) nl_attr_get(attrs[DP_GENL_A_OPENFLOW]);
    buffer->size = nl_attr_get_size(attrs[DP_GENL_A_OPENFLOW]);
    ofp_len = ntohs(oh->length);
    if (ofp_len != buffer->size) {
        VLOG_WARN_RL(&rl,
                     "ofp_header.length %"PRIu16" != attribute length %zu\n",
                     ofp_len, buffer->size);
        buffer->size = MIN(ofp_len, buffer->size);
    }
    *bufferp = buffer;
    return 0;

error:
    ofpbuf_delete(buffer);
    return EPROTO;
}
static int
nl_sock_transact_multiple__(struct nl_sock *sock,
                            struct nl_transaction **transactions, size_t n,
                            size_t *done)
{
    uint64_t tmp_reply_stub[1024 / 8];
    struct nl_transaction tmp_txn;
    struct ofpbuf tmp_reply;

    uint32_t base_seq;
    struct iovec iovs[MAX_IOVS];
    struct msghdr msg;
    int error;
    int i;

    base_seq = nl_sock_allocate_seq(sock, n);
    *done = 0;
    for (i = 0; i < n; i++) {
        struct nl_transaction *txn = transactions[i];
        struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(txn->request);

        nlmsg->nlmsg_len = txn->request->size;
        nlmsg->nlmsg_seq = base_seq + i;
        nlmsg->nlmsg_pid = sock->pid;

        iovs[i].iov_base = txn->request->data;
        iovs[i].iov_len = txn->request->size;
    }

    memset(&msg, 0, sizeof msg);
    msg.msg_iov = iovs;
    msg.msg_iovlen = n;
    do {
        error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
    } while (error == EINTR);

    for (i = 0; i < n; i++) {
        struct nl_transaction *txn = transactions[i];

        log_nlmsg(__func__, error, txn->request->data, txn->request->size,
                  sock->protocol);
    }
    if (!error) {
        COVERAGE_ADD(netlink_sent, n);
    }

    if (error) {
        return error;
    }

    ofpbuf_use_stub(&tmp_reply, tmp_reply_stub, sizeof tmp_reply_stub);
    tmp_txn.request = NULL;
    tmp_txn.reply = &tmp_reply;
    tmp_txn.error = 0;
    while (n > 0) {
        struct nl_transaction *buf_txn, *txn;
        uint32_t seq;

        /* Find a transaction whose buffer we can use for receiving a reply.
         * If no such transaction is left, use tmp_txn. */
        buf_txn = &tmp_txn;
        for (i = 0; i < n; i++) {
            if (transactions[i]->reply) {
                buf_txn = transactions[i];
                break;
            }
        }

        /* Receive a reply. */
        error = nl_sock_recv__(sock, buf_txn->reply, false);
        if (error) {
            if (error == EAGAIN) {
                nl_sock_record_errors__(transactions, n, 0);
                *done += n;
                error = 0;
            }
            break;
        }

        /* Match the reply up with a transaction. */
        seq = nl_msg_nlmsghdr(buf_txn->reply)->nlmsg_seq;
        if (seq < base_seq || seq >= base_seq + n) {
            VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, seq);
            continue;
        }
        i = seq - base_seq;
        txn = transactions[i];

        /* Fill in the results for 'txn'. */
        if (nl_msg_nlmsgerr(buf_txn->reply, &txn->error)) {
            if (txn->reply) {
                ofpbuf_clear(txn->reply);
            }
            if (txn->error) {
                VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
                            error, ovs_strerror(txn->error));
            }
        } else {
            txn->error = 0;
            if (txn->reply && txn != buf_txn) {
                /* Swap buffers. */
                struct ofpbuf *reply = buf_txn->reply;
                buf_txn->reply = txn->reply;
                txn->reply = reply;
            }
        }

        /* Fill in the results for transactions before 'txn'.  (We have to do
         * this after the results for 'txn' itself because of the buffer swap
         * above.) */
        nl_sock_record_errors__(transactions, i, 0);

        /* Advance. */
        *done += i + 1;
        transactions += i + 1;
        n -= i + 1;
        base_seq += i + 1;
    }
    ofpbuf_uninit(&tmp_reply);

    return error;
}